From 4d221d67f1c41d660afc68e50d957dd61aa2344d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 14:57:02 -0700
Subject: [PATCH 001/262] commit

---
 .../triton_backend/bart/BartTritonModel.cc    | 401 ++++++++++++++++++
 .../triton_backend/bart/BartTritonModel.h     | 114 +++++
 .../bart/BartTritonModelInstance.cc           | 269 ++++++++++++
 .../bart/BartTritonModelInstance.h            |  94 ++++
 .../triton_backend/bart/CMakeLists.txt        |  25 ++
 5 files changed, 903 insertions(+)
 create mode 100644 src/fastertransformer/triton_backend/bart/BartTritonModel.cc
 create mode 100644 src/fastertransformer/triton_backend/bart/BartTritonModel.h
 create mode 100644 src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
 create mode 100644 src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h
 create mode 100644 src/fastertransformer/triton_backend/bart/CMakeLists.txt
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
new file mode 100644
index 000000000..e30c11a43
--- /dev/null
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/triton_backend/t5/T5TritonModel.h"
+#include "src/fastertransformer/triton_backend/t5/T5TritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/fastertransformer/utils/allocator.h"
+
+namespace ft = fastertransformer;
+
+std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createT5Model(std::string model_dir)
+{
+    // Factory: builds the T5TritonModel<T> selected by data_type in <model_dir>/config.ini.
+    // Returns nullptr if the ini cannot be parsed; an unknown data_type logs and calls exit(-1).
+    // NOTE(review): this file is bart/BartTritonModel.cc yet defines the T5 factory - confirm intended.
+    INIReader reader(model_dir + "/config.ini");
+    if (reader.ParseError() < 0) {
+        std::cout << "[ERROR] Can't load '" << model_dir << "/config.ini" << "'\n";
+        return nullptr;
+    }
+
+    const std::string dtype = reader.Get("ft_instance_hyperparameter", "data_type");
+    if (dtype == "fp16") {
+        return std::make_shared<T5TritonModel<half>>(reader, model_dir);
+    }
+#ifdef ENABLE_BF16
+    if (dtype == "bf16") {
+        return std::make_shared<T5TritonModel<__nv_bfloat16>>(reader, model_dir);
+    }
+#endif
+    if (dtype == "fp32") {
+        return std::make_shared<T5TritonModel<float>>(reader, model_dir);
+    }
+    FT_LOG_ERROR("Unsupported data type " + dtype);
+    exit(-1);
+}
+
+template<typename T>
+T5TritonModel<T>::T5TritonModel(INIReader reader, std::string model_dir): model_dir_(model_dir)
+{
+    // Fills the hyper-parameters from an already parsed config.ini. The shared-weight vectors,
+    // activation_type_, tie_word_embeddings_ and ia3_num_tasks_ are not assigned here (NOTE(review): confirm).
+    const std::string pos_key = "relative_attention_num_buckets_or_max_pos_seq_len";
+    // encoder
+    encoder_head_num_      = reader.GetInteger("encoder", "num_heads");
+    encoder_size_per_head_ = reader.GetInteger("encoder", "d_kv");
+    encoder_d_model_       = reader.GetInteger("encoder", "d_model");
+    encoder_inter_size_    = reader.GetInteger("encoder", "d_ff");
+    encoder_num_layer_     = reader.GetInteger("encoder", "num_layers");
+    encoder_vocab_size_    = reader.GetInteger("encoder", "vocab_size");
+    encoder_num_bucket_or_max_pos_seq_len_ = reader.GetInteger("encoder", pos_key);
+    encoder_adapter_.interSize(reader.GetInteger("encoder", "adapter_inter_size", 0));
+    encoder_adapter_.layerNormType(reader.Get("encoder", "adapter_norm_position", "pre"));
+
+    // decoding
+    decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
+    decoding_size_per_head_ = reader.GetInteger("decoder", "d_kv");
+    decoding_d_model_       = reader.GetInteger("decoder", "d_model");
+    decoding_inter_size_    = reader.GetInteger("decoder", "d_ff");
+    decoding_num_layer_     = reader.GetInteger("decoder", "num_layers");
+    decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
+    decoding_num_bucket_or_max_pos_seq_len_ = reader.GetInteger("decoder", pos_key);
+    decoding_adapter_.interSize(reader.GetInteger("decoder", "adapter_inter_size", 0));
+    decoding_adapter_.layerNormType(reader.Get("decoder", "adapter_norm_position", "pre"));
+    start_id_                 = reader.GetInteger("decoder", "decoder_start_token_id");
+    end_id_                   = reader.GetInteger("decoder", "eos_token_id");
+    tensor_para_size_         = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
+    pipeline_para_size_       = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
+    enable_custom_all_reduce_ = reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0);
+    t5_with_bias_             = reader.GetBoolean("structure", "t5_with_bias", false);
+    use_gated_activation_     = reader.GetBoolean("structure", "use_gated_activation", false);
+    const bool relative_pos   = reader.Get("structure", "position_embedding_type", "relative") == "relative";
+    position_embedding_type_  = static_cast<ft::PositionEmbeddingType>(relative_pos ? 0 : 1);
+    q_scaling_    = t5_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
+    max_distance_ = 128;  // use default value of huggingface here
+}
+
+template<typename T>
+T5TritonModel<T>::T5TritonModel(size_t      tensor_para_size,
+                                size_t      pipeline_para_size,
+                                int         enable_custom_all_reduce,
+                                std::string model_dir,
+                                int         int8_mode):
+    tensor_para_size_(tensor_para_size),
+    pipeline_para_size_(pipeline_para_size),
+    encoder_shared_weights_(ft::getDeviceCount()),
+    decoding_shared_weights_(ft::getDeviceCount()),
+    enable_custom_all_reduce_(enable_custom_all_reduce),
+    model_dir_(model_dir),
+    int8_mode_(int8_mode)
+{
+    // Loads <model_dir>/config.ini itself and reserves one (initially empty) shared-weight slot per
+    // device reported by ft::getDeviceCount(). A parse failure or int8_mode != 0 trips FT_CHECK.
+    INIReader reader(model_dir + "/config.ini");
+    if (reader.ParseError() < 0) {
+        std::cout << "[ERROR] Can't load '" << model_dir << "/config.ini" << "'\n";
+        ft::FT_CHECK(false);
+    }
+
+    ft::FT_CHECK(int8_mode_ == 0);
+
+    const std::string pos_key = "relative_attention_num_buckets_or_max_pos_seq_len";
+    model_name_ = reader.Get("encoder", "_name_or_path");
+    // encoder
+    encoder_head_num_      = reader.GetInteger("encoder", "num_heads");
+    encoder_size_per_head_ = reader.GetInteger("encoder", "d_kv");
+    encoder_d_model_       = reader.GetInteger("encoder", "d_model");
+    encoder_inter_size_    = reader.GetInteger("encoder", "d_ff");
+    encoder_num_layer_     = reader.GetInteger("encoder", "num_layers");
+    encoder_vocab_size_    = reader.GetInteger("encoder", "vocab_size");
+    encoder_num_bucket_or_max_pos_seq_len_ = reader.GetInteger("encoder", pos_key);
+    encoder_adapter_.interSize(reader.GetInteger("encoder", "adapter_inter_size", 0));
+    encoder_adapter_.layerNormType(reader.Get("encoder", "adapter_norm_position", "pre"));
+
+    // encoder prompt
+    num_tasks_                = reader.GetInteger("encoder", "num_tasks", 0);
+    prompt_learning_start_id_ = reader.GetInteger("encoder", "prompt_learning_start_id", encoder_vocab_size_ + 1);
+    prompt_learning_type_ =
+        static_cast<ft::PromptLearningType>(reader.GetInteger("encoder", "prompt_learning_type", 0));
+
+    for (int task_id = 0; task_id < num_tasks_; ++task_id) {
+        const std::string section    = "task_" + std::to_string(task_id);
+        const std::string task_name  = reader.Get(section, "task_name");
+        const int         prompt_len = reader.GetInteger(section, "prompt_length", 0);
+        prompt_learning_table_pair_.insert({task_name, {task_id, prompt_len}});
+    }
+
+    // decoding
+    decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
+    decoding_size_per_head_ = reader.GetInteger("decoder", "d_kv");
+    decoding_d_model_       = reader.GetInteger("decoder", "d_model");
+    decoding_inter_size_    = reader.GetInteger("decoder", "d_ff");
+    decoding_num_layer_     = reader.GetInteger("decoder", "num_layers");
+    decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
+    decoding_num_bucket_or_max_pos_seq_len_ = reader.GetInteger("decoder", pos_key);
+    decoding_adapter_.interSize(reader.GetInteger("decoder", "adapter_inter_size", 0));
+    decoding_adapter_.layerNormType(reader.Get("decoder", "adapter_norm_position", "pre"));
+
+    start_id_            = reader.GetInteger("decoder", "decoder_start_token_id");
+    end_id_              = reader.GetInteger("decoder", "eos_token_id");
+    tie_word_embeddings_ = reader.GetBoolean("decoder", "tie_word_embeddings", true);
+
+    // common settings
+    t5_with_bias_         = reader.GetBoolean("structure", "t5_with_bias", false);
+    use_gated_activation_ = reader.GetBoolean("structure", "use_gated_activation", false);
+    activation_type_      = ft::getActivationType(reader.Get("encoder", "feed_forward_proj"));
+    const bool relative_pos  = reader.Get("structure", "position_embedding_type", "relative") == "relative";
+    position_embedding_type_ = static_cast<ft::PositionEmbeddingType>(relative_pos ? 0 : 1);
+    q_scaling_ = t5_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
+
+    ia3_num_tasks_ = reader.GetInteger("structure", "ia3_num_tasks", 0);
+
+    max_distance_ = 128;  // use default value of huggingface here
+}
+
+template<typename T>
+std::unique_ptr<AbstractTransformerModelInstance>
+T5TritonModel<T>::createModelInstance(int                                                               device_id,
+                                      int                                                               rank,
+                                      cudaStream_t                                                      stream,
+                                      std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
+                                      std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm)
+{
+    // Creates the per-device T5 encoder/decoding objects and moves them, with their helpers, into an instance.
+    ft::check_cuda_error(cudaSetDevice(device_id));
+    const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_);
+
+    std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator(
+        new ft::Allocator<ft::AllocatorType::CUDA>(device_id));
+    allocator->setStream(stream);
+
+    cublasHandle_t   cublas_handle;
+    cublasLtHandle_t cublaslt_handle;
+    // Surface cuBLAS setup failures immediately instead of continuing with an invalid handle.
+    ft::check_cuda_error(cublasCreate(&cublas_handle));
+    ft::check_cuda_error(cublasLtCreate(&cublaslt_handle));
+    ft::check_cuda_error(cublasSetStream(cublas_handle, stream));
+
+    std::unique_ptr<ft::cublasAlgoMap>   cublas_algo_map(new ft::cublasAlgoMap("gemm_config.in"));
+    std::unique_ptr<std::mutex>          cublas_wrapper_mutex(new std::mutex());
+    std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper(new ft::cublasMMWrapper(
+        cublas_handle, cublaslt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get()));
+
+    std::unique_ptr<cudaDeviceProp> cuda_device_prop_ptr(new cudaDeviceProp);
+    ft::check_cuda_error(cudaGetDeviceProperties(cuda_device_prop_ptr.get(), device_id));
+
+    if (std::is_same<T, half>::value) {
+        cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F);
+    }
+#ifdef ENABLE_BF16
+    else if (std::is_same<T, __nv_bfloat16>::value) {
+        cublas_wrapper->setBF16GemmConfig();
+    }
+#endif
+    else if (std::is_same<T, float>::value) {
+        cublas_wrapper->setFP32GemmConfig();
+    }
+
+    const int sm_ = ft::getSMVersion();
+
+    // TODO(bhsueh) not support fused mha
+    // NOTE: fmha doesn't support t5-style relative position bias
+    ft::AttentionType attention_type =
+        ft::getAttentionType<T>(encoder_size_per_head_, sm_, true, encoder_num_bucket_or_max_pos_seq_len_, false);
+
+    ft::NcclParam tensor_para_   = nccl_params.first[comms_rank];
+    ft::NcclParam pipeline_para_ = nccl_params.second[comms_rank];
+
+    auto encoder = std::make_unique<ft::T5Encoder<T>>(ft::T5Encoder<T>(0,
+                                                                       0,
+                                                                       encoder_head_num_,
+                                                                       encoder_size_per_head_,
+                                                                       encoder_inter_size_,
+                                                                       encoder_d_model_,
+                                                                       encoder_num_layer_,
+                                                                       encoder_num_bucket_or_max_pos_seq_len_,
+                                                                       0,  // expert_num
+                                                                       max_distance_,
+                                                                       0,  // moe_k
+                                                                       sm_,
+                                                                       q_scaling_,
+                                                                       {},  // moe_layer_index
+                                                                       stream,
+                                                                       cublas_wrapper.get(),
+                                                                       allocator.get(),
+                                                                       false,
+                                                                       attention_type,
+                                                                       false,
+                                                                       activation_type_,
+                                                                       ft::LayerNormType::pre_layernorm,
+                                                                       tensor_para_,
+                                                                       pipeline_para_,
+                                                                       prompt_learning_start_id_,
+                                                                       prompt_learning_type_,
+                                                                       custom_all_reduce_comm,
+                                                                       enable_custom_all_reduce_,
+                                                                       encoder_adapter_));
+
+    auto decoding = std::make_unique<ft::T5Decoding<T>>(ft::T5Decoding<T>(0,
+                                                                          0,
+                                                                          0,
+                                                                          0,
+                                                                          decoding_head_num_,
+                                                                          decoding_size_per_head_,
+                                                                          decoding_inter_size_,
+                                                                          decoding_d_model_,
+                                                                          decoding_num_layer_,
+                                                                          decoding_vocab_size_,
+                                                                          decoding_num_bucket_or_max_pos_seq_len_,
+                                                                          0,  // expert_num
+                                                                          max_distance_,
+                                                                          0,  // moe_k
+                                                                          q_scaling_,
+                                                                          start_id_,
+                                                                          end_id_,
+                                                                          0.0f,  // beam_search_diversity_rate_,
+                                                                          1,     // top_k_,
+                                                                          0.0f,  // top_p_,
+                                                                          1.0f,  // temperature_,
+                                                                          0.0f,  // len_penalty_,
+                                                                          1.0f,  // repetition_penalty_,
+                                                                          {},    // moe_layer_index
+                                                                          stream,
+                                                                          cublas_wrapper.get(),
+                                                                          allocator.get(),
+                                                                          false,
+                                                                          cuda_device_prop_ptr.get(),
+                                                                          tensor_para_,
+                                                                          pipeline_para_,
+                                                                          activation_type_,
+                                                                          tie_word_embeddings_,
+                                                                          custom_all_reduce_comm,
+                                                                          enable_custom_all_reduce_,
+                                                                          decoding_adapter_));
+
+    return std::unique_ptr<T5TritonModelInstance<T>>(new T5TritonModelInstance<T>(std::move(encoder),
+                                                                                  std::move(decoding),
+                                                                                  encoder_shared_weights_[device_id],
+                                                                                  decoding_shared_weights_[device_id],
+                                                                                  std::move(allocator),
+                                                                                  std::move(cublas_algo_map),
+                                                                                  std::move(cublas_wrapper_mutex),
+                                                                                  std::move(cublas_wrapper),
+                                                                                  std::move(cuda_device_prop_ptr)));
+}
+
+template<typename T>
+void T5TritonModel<T>::createSharedWeights(int device_id, int rank)
+{
+    // Builds the encoder and decoding weight objects for `device_id` and loads both from model_dir_.
+    ft::check_cuda_error(cudaSetDevice(device_id));
+    const int tp_rank = rank % tensor_para_size_;
+    const int pp_rank = rank / tensor_para_size_;
+
+    auto& encoder_weights = encoder_shared_weights_[device_id];
+    encoder_weights = std::make_shared<ft::T5EncoderWeight<T>>(encoder_head_num_,
+                                                               encoder_size_per_head_,
+                                                               encoder_d_model_,
+                                                               encoder_inter_size_,
+                                                               encoder_vocab_size_,
+                                                               encoder_num_layer_,
+                                                               encoder_num_bucket_or_max_pos_seq_len_,
+                                                               tensor_para_size_,
+                                                               tp_rank,
+                                                               pipeline_para_size_,
+                                                               pp_rank,
+                                                               t5_with_bias_,
+                                                               use_gated_activation_,
+                                                               position_embedding_type_,
+                                                               prompt_learning_type_,
+                                                               prompt_learning_table_pair_,
+                                                               ia3_num_tasks_,
+                                                               encoder_adapter_.interSize());
+
+    auto& decoding_weights = decoding_shared_weights_[device_id];
+    decoding_weights = std::make_shared<ft::T5DecodingWeight<T>>(decoding_head_num_,
+                                                                 decoding_size_per_head_,
+                                                                 decoding_d_model_,
+                                                                 decoding_inter_size_,
+                                                                 decoding_vocab_size_,
+                                                                 decoding_num_layer_,
+                                                                 encoder_d_model_,
+                                                                 decoding_num_bucket_or_max_pos_seq_len_,
+                                                                 tensor_para_size_,
+                                                                 tp_rank,
+                                                                 pipeline_para_size_,
+                                                                 pp_rank,
+                                                                 t5_with_bias_,
+                                                                 use_gated_activation_,
+                                                                 position_embedding_type_,
+                                                                 ia3_num_tasks_,
+                                                                 decoding_adapter_.interSize());
+    encoder_weights->loadModel(model_dir_);
+    decoding_weights->loadModel(model_dir_);
+}
+
+template<typename T>
+std::string T5TritonModel<T>::toString()
+{
+    // Human-readable dump of the configuration: one "    name_: value" entry per line.
+    std::stringstream ss;
+    std::string       position_embedding_type_string =
+        position_embedding_type_ == ft::PositionEmbeddingType::relative ? "relative" : "absolute";
+
+    ss << "\nModel: "
+       << "\n    encoder_head_num_: " << encoder_head_num_ << "\n    encoder_size_per_head_: " << encoder_size_per_head_
+       << "\n    encoder_d_model_: " << encoder_d_model_ << "\n    encoder_inter_size_: " << encoder_inter_size_
+       << "\n    encoder_num_layer_: " << encoder_num_layer_ << "\n    encoder_vocab_size_: " << encoder_vocab_size_
+       << "\n    encoder_num_bucket_or_max_pos_seq_len_: " << encoder_num_bucket_or_max_pos_seq_len_
+       << "\n    encoder_adapter_: " << encoder_adapter_.toString()
+       << "\n    decoding_head_num_: " << decoding_head_num_
+       << "\n    decoding_size_per_head_: " << decoding_size_per_head_
+       << "\n    decoding_d_model_: " << decoding_d_model_ << "\n    decoding_inter_size_: " << decoding_inter_size_
+       << "\n    decoding_num_layer_: " << decoding_num_layer_ << "\n    decoding_vocab_size_: " << decoding_vocab_size_
+       << "\n    decoding_num_bucket_or_max_pos_seq_len_: " << decoding_num_bucket_or_max_pos_seq_len_
+       << "\n    decoding_adapter_: " << decoding_adapter_.toString() << "\n    t5_with_bias_: " << t5_with_bias_
+       << "\n    use_gated_activation_: " << use_gated_activation_
+       << "\n    position_embedding_type_: " << position_embedding_type_string << "\n    start_id_: " << start_id_
+       << "\n    end_id_: " << end_id_ << "\n    model_name_: " << model_name_ << "\n    model_dir_: " << model_dir_
+       << '\n';  // plain newline: flushing an in-memory stream via std::endl adds nothing
+    return ss.str();
+}
+
+template<typename T>
+void T5TritonModel<T>::createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
+                                         int                                                   world_size)
+{
+    using CommElem = typename ft::CustomARCommTypeConverter<T>::Type;  // comm element type mapped from T
+    ft::initCustomAllReduceComm<CommElem>(custom_all_reduce_comms, enable_custom_all_reduce_, world_size);
+}
+
+template<typename T>
+int T5TritonModel<T>::getTensorParaSize()
+{
+    return static_cast<int>(tensor_para_size_);  // value stored by whichever constructor built this model
+}
+
+template<typename T>
+int T5TritonModel<T>::getPipelineParaSize()
+{
+    return static_cast<int>(pipeline_para_size_);  // value stored by whichever constructor built this model
+}
+
+template struct T5TritonModel<float>;  // explicit instantiations: the member definitions live in this .cc
+template struct T5TritonModel<half>;
+#ifdef ENABLE_BF16
+template struct T5TritonModel<__nv_bfloat16>;  // only when built with ENABLE_BF16
+#endif
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
new file mode 100644
index 000000000..1ffe0a407
--- /dev/null
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "3rdparty/INIReader.h"
+#include "src/fastertransformer/models/bart/BartDecoding.h"
+#include "src/fastertransformer/models/bart/BartEncoder.h"
+#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/fastertransformer/utils/nccl_utils.h"
+#include <cuda_fp16.h>
+
+namespace ft = fastertransformer;
+
+template<typename T>
+struct BartTritonModel: public AbstractTransformerModel {  // BART model description + per-device shared weights
+    BartTritonModel(INIReader reader, std::string model_dir);
+
+    BartTritonModel(size_t      tensor_para_size,
+                    size_t      pipeline_para_size,
+                    int         enable_custom_all_reduce,
+                    std::string model_dir,
+                    int         int8_mode);
+
+    ~BartTritonModel() = default;
+
+    virtual std::unique_ptr<AbstractTransformerModelInstance>
+    createModelInstance(int                                                               deviceId,
+                        int                                                               rank,
+                        cudaStream_t                                                      stream,
+                        std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
+                        std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;
+
+    virtual void createSharedWeights(int deviceId, int rank) override;
+
+    virtual void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
+                                   int                                                   world_size) override;
+
+    virtual std::string toString() override;
+    virtual int         getTensorParaSize() override;
+    virtual int         getPipelineParaSize() override;
+
+private:
+    // encoder (every scalar below has a default so no constructor path can leave it indeterminate)
+    size_t                  encoder_head_num_                      = 0;
+    size_t                  encoder_size_per_head_                 = 0;
+    size_t                  encoder_d_model_                       = 0;
+    size_t                  encoder_inter_size_                    = 0;
+    size_t                  encoder_num_layer_                     = 0;
+    size_t                  encoder_vocab_size_                    = 0;
+    size_t                  encoder_num_bucket_or_max_pos_seq_len_ = 0;
+    ft::LinearAdapterConfig encoder_adapter_{};
+
+    // prompt for encoder
+    size_t                                     num_tasks_                  = 0;
+    int                                        prompt_learning_start_id_   = 0;
+    ft::PromptLearningType                     prompt_learning_type_       = ft::PromptLearningType::no_prompt;
+    std::map<std::string, std::pair<int, int>> prompt_learning_table_pair_ = {};
+
+    // decoding
+    size_t                  decoding_head_num_                      = 0;
+    size_t                  decoding_size_per_head_                 = 0;
+    size_t                  decoding_d_model_                       = 0;
+    size_t                  decoding_inter_size_                    = 0;
+    size_t                  decoding_num_layer_                     = 0;
+    size_t                  decoding_vocab_size_                    = 0;
+    size_t                  decoding_num_bucket_or_max_pos_seq_len_ = 0;
+    ft::LinearAdapterConfig decoding_adapter_{};
+
+    float  q_scaling_     = 1.0f;
+    size_t ia3_num_tasks_ = 0;
+
+    size_t max_distance_ = 0;
+    int    start_id_     = 0;
+    int    end_id_       = 0;
+
+    bool tie_word_embeddings_ = true;
+
+    size_t tensor_para_size_   = 1;  // 1 (not 0) so a product of the two sizes is never a zero divisor
+    size_t pipeline_para_size_ = 1;
+
+    // shared weights for each device
+    std::vector<std::shared_ptr<ft::BartEncoderWeight<T>>>  encoder_shared_weights_;
+    std::vector<std::shared_ptr<ft::BartDecodingWeight<T>>> decoding_shared_weights_;
+
+    // bart structure difference
+    bool                      bart_with_bias_       = false;
+    bool                      use_gated_activation_ = false;
+    ft::PositionEmbeddingType position_embedding_type_{};  // value-initialized (enumerator 0)
+    ft::ActivationType        activation_type_{};          // value-initialized (enumerator 0)
+
+    bool is_fp16_   = false;
+    int  int8_mode_ = 0;
+
+    int enable_custom_all_reduce_ = 0;
+
+    std::string model_name_;
+    std::string model_dir_;
+};
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
new file mode 100644
index 000000000..05e81e253
--- /dev/null
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/triton_backend/t5/T5TritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/fastertransformer/triton_backend/triton_utils.hpp"
+#include "src/fastertransformer/utils/Tensor.h"
+#include <vector>
+
+namespace ft = fastertransformer;
+
+template<typename T>
+void triton_stream_callback(ft::TensorMap* output_tensors, void* ctx)
+{
+    auto* const model  = reinterpret_cast<T5TritonModelInstance<T>*>(ctx);
+    auto const  result = T5TritonModelInstance<T>::convert_outputs(*output_tensors);
+
+    model->stream_cb_(result, model->stream_ctx_);
+}
+
+template<typename T>
+T5TritonModelInstance<T>::T5TritonModelInstance(std::unique_ptr<ft::T5Encoder<T>>        t5_encoder,
+                                                std::unique_ptr<ft::T5Decoding<T>>       t5_decoding,
+                                                std::shared_ptr<ft::T5EncoderWeight<T>>  t5_encoder_weight,
+                                                std::shared_ptr<ft::T5DecodingWeight<T>> t5_decoding_weight,
+                                                std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator,
+                                                std::unique_ptr<ft::cublasAlgoMap>                      cublas_algo_map,
+                                                std::unique_ptr<std::mutex>          cublas_wrapper_mutex,
+                                                std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper,
+                                                std::unique_ptr<cudaDeviceProp>      cuda_device_prop_ptr):
+    t5_encoder_(std::move(t5_encoder)),
+    t5_decoding_(std::move(t5_decoding)),
+    t5_encoder_weight_(t5_encoder_weight),
+    t5_decoding_weight_(t5_decoding_weight),
+    allocator_(std::move(allocator)),
+    cublas_algo_map_(std::move(cublas_algo_map)),
+    cublas_wrapper_mutex_(std::move(cublas_wrapper_mutex)),
+    cublas_wrapper_(std::move(cublas_wrapper)),
+    cuda_device_prop_ptr_(std::move(cuda_device_prop_ptr))
+{
+}
+
+template<typename T>
+ft::TensorMap
+T5TritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
+{
+    move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
+    move_tensor_H2D(input_tensors->at("sequence_length"), d_input_lengths_, &allocator_);
+
+    ft::TensorMap ft_input_tensors(
+        {{"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)},
+         {"sequence_length", as_GPU_tensor(input_tensors->at("sequence_length"), d_input_lengths_)}});
+
+    if (input_tensors->count("prompt_learning_task_name_ids")) {
+        ft_input_tensors.insert({"prompt_learning_task_name_ids",
+                                 input_tensors->at("prompt_learning_task_name_ids").convertTritonTensorToFt()});
+    }
+    if (input_tensors->count("request_prompt_lengths")) {
+        move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_);
+        ft_input_tensors.insert(
+            {"request_prompt_lengths",
+             as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)});
+    }
+    if (input_tensors->count("request_prompt_embedding")) {
+        move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_);
+        ft_input_tensors.insert(
+            {"request_prompt_embedding",
+             as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)});
+    }
+    if (input_tensors->count("ia3_tasks")) {
+        ft_input_tensors.insert({"ia3_tasks", as_GPU_tensor(input_tensors->at("ia3_tasks"), d_input_ia3_tasks_)});
+    }
+    return ft_input_tensors;
+}
+
+template<typename T>
+std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
+T5TritonModelInstance<T>::convert_outputs(ft::TensorMap& output_tensors)
+{
+    std::unordered_map<std::string, triton::Tensor>* outputs_mapping =
+        new std::unordered_map<std::string, triton::Tensor>();
+
+    for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) {
+        outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)});
+    }
+
+    return std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>(outputs_mapping);
+}
+
+template<typename T>
+std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
+T5TritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
+{
+    const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
+    const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
+    const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);
+    const size_t beam_width =
+        input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
+    const bool has_ia3_tasks = input_tensors->count("ia3_tasks");
+
+    allocateBuffer(request_batch_size, beam_width, max_output_len, mem_max_seq_len);
+
+    if (has_ia3_tasks) {
+        move_tensor_H2D(input_tensors->at("ia3_tasks"), d_input_ia3_tasks_, &allocator_);
+    }
+
+    ft::TensorMap encoder_input_tensors(convert_inputs(input_tensors));
+
+    ft::TensorMap encoder_output_tensors(
+        {{"output_hidden_state",
+          ft::Tensor{ft::MEMORY_GPU,
+                     ft::getTensorType<T>(),
+                     std::vector<size_t>{request_batch_size, mem_max_seq_len, t5_encoder_->getDModel()},
+                     d_encoder_outputs_}}});
+
+    ft::TensorMap decoding_input_tensors({{"encoder_output", encoder_output_tensors.at("output_hidden_state")},
+                                          {"encoder_sequence_length", encoder_input_tensors.at("sequence_length")}});
+
+    if (input_tensors->find("top_p_decay") != input_tensors->end()) {
+        move_tensor_H2D(input_tensors->at("top_p_decay"), d_top_p_decay_, &allocator_);
+        decoding_input_tensors.insert({"top_p_decay", as_GPU_tensor(input_tensors->at("top_p_decay"), d_top_p_decay_)});
+    }
+    if (input_tensors->find("top_p_min") != input_tensors->end()) {
+        move_tensor_H2D(input_tensors->at("top_p_min"), d_top_p_min_, &allocator_);
+        decoding_input_tensors.insert({"top_p_min", as_GPU_tensor(input_tensors->at("top_p_min"), d_top_p_min_)});
+    }
+    if (input_tensors->find("top_p_reset_ids") != input_tensors->end()) {
+        move_tensor_H2D(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_, &allocator_);
+        decoding_input_tensors.insert(
+            {"top_p_reset_ids", as_GPU_tensor(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_)});
+    }
+
+    std::set<std::string> keys_on_gpu = {"input_ids",
+                                         "sequence_length",
+                                         "bad_words_list",
+                                         "stop_words_list",
+                                         "ia3_tasks",
+                                         "top_p_decay",
+                                         "top_p_min",
+                                         "top_p_reset_ids"};
+    for (auto& t : *input_tensors) {
+        if (keys_on_gpu.count(t.first) == 0) {
+            decoding_input_tensors.insert({t.first, t.second.convertTritonTensorToFt()});
+        }
+    }
+
+    if (input_tensors->find("bad_words_list") != input_tensors->end()) {
+        move_tensor_H2D(input_tensors->at("bad_words_list"), d_input_bad_words_, &allocator_);
+        decoding_input_tensors.insert(
+            {"bad_words_list", as_GPU_tensor(input_tensors->at("bad_words_list"), d_input_bad_words_)});
+    }
+
+    if (input_tensors->find("stop_words_list") != input_tensors->end()) {
+        move_tensor_H2D(input_tensors->at("stop_words_list"), d_input_stop_words_, &allocator_);
+        decoding_input_tensors.insert(
+            {"stop_words_list", as_GPU_tensor(input_tensors->at("stop_words_list"), d_input_stop_words_)});
+    }
+
+    ft::TensorMap decoding_output_tensors(
+        {{"output_ids",
+          ft::Tensor{ft::MEMORY_GPU,
+                     ft::TYPE_INT32,
+                     std::vector<size_t>{request_batch_size, beam_width, max_output_len},
+                     d_output_ids_}},
+         {"sequence_length",
+          ft::Tensor{ft::MEMORY_GPU,
+                     ft::TYPE_INT32,
+                     std::vector<size_t>{request_batch_size, beam_width},
+                     d_sequence_lengths_}}});
+    if (input_tensors->count("is_return_log_probs") > 0
+        && input_tensors->at("is_return_log_probs").convertTritonTensorToFt().getVal<bool>()) {
+        decoding_output_tensors.insert({"output_log_probs",
+                                        ft::Tensor{ft::MEMORY_GPU,
+                                                   ft::TYPE_FP32,
+                                                   std::vector<size_t>{request_batch_size, beam_width, max_output_len},
+                                                   d_output_log_probs_}});
+        decoding_output_tensors.insert({"cum_log_probs",
+                                        ft::Tensor{ft::MEMORY_GPU,
+                                                   ft::TYPE_FP32,
+                                                   std::vector<size_t>{request_batch_size, beam_width},
+                                                   d_cum_log_probs_}});
+    }
+
+    if (has_ia3_tasks) {
+        const auto num_ia3_tasks = t5_encoder_weight_->getNumIA3Tasks();
+        FT_CHECK_WITH_INFO(num_ia3_tasks > 0, "Cannot request ia3_tasks, model has no IA3 adapters");
+        const bool is_within_range = ft::invokeCheckRange<int>(
+            d_input_ia3_tasks_, request_batch_size, 0, num_ia3_tasks - 1, d_within_range_, t5_encoder_->getStream());
+        FT_CHECK_WITH_INFO(is_within_range,
+                           ft::fmtstr("Requested IA3 tasks aren't in the range [0, %d).", num_ia3_tasks));
+
+        decoding_input_tensors.insert({"ia3_tasks", as_GPU_tensor(input_tensors->at("ia3_tasks"), d_input_ia3_tasks_)});
+    }
+
+    try {
+        if (stream_cb_ != nullptr) {
+            t5_decoding_->registerCallback(triton_stream_callback<T>, this);
+        }
+
+        t5_encoder_->forward(&encoder_output_tensors, &encoder_input_tensors, t5_encoder_weight_.get());
+        t5_decoding_->forward(&decoding_output_tensors, &decoding_input_tensors, t5_decoding_weight_.get());
+
+        if (stream_cb_ != nullptr) {
+            t5_decoding_->unRegisterCallback();
+        }
+    }
+    catch (...) {
+        h_exception_ = std::current_exception();
+        decoding_output_tensors.insert(
+            {"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}});
+    }
+
+    return convert_outputs(decoding_output_tensors);
+}
+
+template<typename T>
+T5TritonModelInstance<T>::~T5TritonModelInstance()
+{
+    freeBuffer();
+}
+
+template<typename T>
+void T5TritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
+                                              const size_t beam_width,
+                                              const size_t max_output_len,
+                                              const size_t mem_max_seq_len)
+{
+    d_output_ids_      = (int*)(allocator_->reMalloc(
+        d_output_ids_, sizeof(int) * request_batch_size * beam_width * max_output_len, false));
+    d_encoder_outputs_ = (T*)(allocator_->reMalloc(
+        d_encoder_outputs_, sizeof(T) * request_batch_size * mem_max_seq_len * t5_encoder_->getDModel(), false));
+    d_sequence_lengths_ =
+        (int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false));
+    d_output_log_probs_ = (float*)(allocator_->reMalloc(
+        d_output_log_probs_, sizeof(float) * request_batch_size * beam_width * max_output_len, false));
+    d_cum_log_probs_    = (float*)(allocator_->reMalloc(
+        d_cum_log_probs_, sizeof(float) * request_batch_size * beam_width * max_output_len, false));
+    d_within_range_     = (bool*)(allocator_->reMalloc(d_within_range_, sizeof(bool)));
+}
+
+template<typename T>
+void T5TritonModelInstance<T>::freeBuffer()
+{
+    allocator_->free((void**)(&d_encoder_outputs_));
+    allocator_->free((void**)(&d_output_ids_));
+    allocator_->free((void**)(&d_sequence_lengths_));
+    allocator_->free((void**)(&d_output_log_probs_));
+    allocator_->free((void**)(&d_cum_log_probs_));
+    allocator_->free((void**)(&d_within_range_));
+}
+
+template struct T5TritonModelInstance<float>;
+template struct T5TritonModelInstance<half>;
+#ifdef ENABLE_BF16
+template struct T5TritonModelInstance<__nv_bfloat16>;
+#endif
\ No newline at end of file
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h
new file mode 100644
index 000000000..26e7268b3
--- /dev/null
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "src/fastertransformer/models/t5/T5Decoding.h"
+#include "src/fastertransformer/models/t5/T5Encoder.h"
+#include "src/fastertransformer/triton_backend/t5/T5TritonModel.h"
+#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include <memory>
+
+namespace ft = fastertransformer;
+
+template<typename T>
+struct T5TritonModelInstance: AbstractTransformerModelInstance {
+
+    T5TritonModelInstance(std::unique_ptr<ft::T5Encoder<T>>                       t5_encoder,
+                          std::unique_ptr<ft::T5Decoding<T>>                      t5_decoding,
+                          std::shared_ptr<ft::T5EncoderWeight<T>>                 t5_encoder_weight,
+                          std::shared_ptr<ft::T5DecodingWeight<T>>                t5_decoding_weight,
+                          std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator,
+                          std::unique_ptr<ft::cublasAlgoMap>                      cublas_algo_map,
+                          std::unique_ptr<std::mutex>                             cublas_wrapper_mutex,
+                          std::unique_ptr<ft::cublasMMWrapper>                    cublas_wrapper,
+                          std::unique_ptr<cudaDeviceProp>                         cuda_device_prop_ptr);
+    ~T5TritonModelInstance();
+
+    std::shared_ptr<std::vector<triton::Tensor>>
+    forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) override
+    {
+        ft::FT_CHECK(false);
+        return nullptr;
+    };
+
+    std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
+    forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors) override;
+
+    static std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
+    convert_outputs(ft::TensorMap& output_tensors);
+
+private:
+    const std::unique_ptr<ft::T5Encoder<T>>                       t5_encoder_;
+    const std::shared_ptr<ft::T5EncoderWeight<T>>                 t5_encoder_weight_;
+    const std::unique_ptr<ft::T5Decoding<T>>                      t5_decoding_;
+    const std::shared_ptr<ft::T5DecodingWeight<T>>                t5_decoding_weight_;
+    const std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator_;
+    const std::unique_ptr<ft::cublasAlgoMap>                      cublas_algo_map_;
+    const std::unique_ptr<std::mutex>                             cublas_wrapper_mutex_;
+    const std::unique_ptr<ft::cublasMMWrapper>                    cublas_wrapper_;
+    const std::unique_ptr<cudaDeviceProp>                         cuda_device_prop_ptr_;
+
+    ft::TensorMap convert_inputs(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors);
+
+    void allocateBuffer(const size_t request_batch_size,
+                        const size_t beam_width,
+                        const size_t max_output_len,
+                        const size_t mem_max_seq_len);
+    void freeBuffer();
+
+    int*   d_input_ids_                = nullptr;
+    int*   d_input_lengths_            = nullptr;
+    int*   d_input_bad_words_          = nullptr;
+    int*   d_input_stop_words_         = nullptr;
+    int*   d_input_ia3_tasks_          = nullptr;
+    int*   d_request_prompt_lengths_   = nullptr;
+    T*     d_request_prompt_embedding_ = nullptr;
+    float* d_top_p_decay_              = nullptr;
+    float* d_top_p_min_                = nullptr;
+    int*   d_top_p_reset_ids_          = nullptr;
+
+    T*     d_encoder_outputs_  = nullptr;
+    int*   d_output_ids_       = nullptr;
+    int*   d_sequence_lengths_ = nullptr;
+    float* d_output_log_probs_ = nullptr;
+    float* d_cum_log_probs_    = nullptr;
+    bool*  d_within_range_     = nullptr;
+
+    int h_total_output_len_;
+
+    std::exception_ptr h_exception_ = nullptr;
+};
diff --git a/src/fastertransformer/triton_backend/bart/CMakeLists.txt b/src/fastertransformer/triton_backend/bart/CMakeLists.txt
new file mode 100644
index 000000000..22171b5ce
--- /dev/null
+++ b/src/fastertransformer/triton_backend/bart/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.8)
+
+set(bart_triton_backend_files
+    T5TritonModel.cc
+    T5TritonModelInstance.cc
+)
+
+add_library(T5TritonBackend STATIC ${t5_triton_backend_files})
+set_property(TARGET T5TritonBackend PROPERTY POSITION_INDEPENDENT_CODE  ON)
+target_link_libraries(T5TritonBackend PRIVATE TransformerTritonBackend T5Encoder T5Decoding -lcublasLt)
+target_compile_features(T5TritonBackend PRIVATE cxx_std_14)

From 3ac4b389c329649950a6d015288e84f111b399af Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 14:57:21 -0700
Subject: [PATCH 002/262] Rename T5 to Bart in BartTritonModelInstance.h

---
 .../bart/BartTritonModelInstance.h            | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h
index 26e7268b3..8c14901e4 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h
@@ -16,27 +16,27 @@
 
 #pragma once
 
-#include "src/fastertransformer/models/t5/T5Decoding.h"
-#include "src/fastertransformer/models/t5/T5Encoder.h"
-#include "src/fastertransformer/triton_backend/t5/T5TritonModel.h"
+#include "src/fastertransformer/models/bart/BartDecoding.h"
+#include "src/fastertransformer/models/bart/BartEncoder.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModel.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
 #include <memory>
 
 namespace ft = fastertransformer;
 
 template<typename T>
-struct T5TritonModelInstance: AbstractTransformerModelInstance {
+struct BartTritonModelInstance: AbstractTransformerModelInstance {
 
-    T5TritonModelInstance(std::unique_ptr<ft::T5Encoder<T>>                       t5_encoder,
-                          std::unique_ptr<ft::T5Decoding<T>>                      t5_decoding,
-                          std::shared_ptr<ft::T5EncoderWeight<T>>                 t5_encoder_weight,
-                          std::shared_ptr<ft::T5DecodingWeight<T>>                t5_decoding_weight,
+    BartTritonModelInstance(std::unique_ptr<ft::BartEncoder<T>>                       bart_encoder,
+                          std::unique_ptr<ft::BartDecoding<T>>                      bart_decoding,
+                          std::shared_ptr<ft::BartEncoderWeight<T>>                 bart_encoder_weight,
+                          std::shared_ptr<ft::BartDecodingWeight<T>>                bart_decoding_weight,
                           std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator,
                           std::unique_ptr<ft::cublasAlgoMap>                      cublas_algo_map,
                           std::unique_ptr<std::mutex>                             cublas_wrapper_mutex,
                           std::unique_ptr<ft::cublasMMWrapper>                    cublas_wrapper,
                           std::unique_ptr<cudaDeviceProp>                         cuda_device_prop_ptr);
-    ~T5TritonModelInstance();
+    ~BartTritonModelInstance();
 
     std::shared_ptr<std::vector<triton::Tensor>>
     forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) override
@@ -52,10 +52,10 @@ struct T5TritonModelInstance: AbstractTransformerModelInstance {
     convert_outputs(ft::TensorMap& output_tensors);
 
 private:
-    const std::unique_ptr<ft::T5Encoder<T>>                       t5_encoder_;
-    const std::shared_ptr<ft::T5EncoderWeight<T>>                 t5_encoder_weight_;
-    const std::unique_ptr<ft::T5Decoding<T>>                      t5_decoding_;
-    const std::shared_ptr<ft::T5DecodingWeight<T>>                t5_decoding_weight_;
+    const std::unique_ptr<ft::BartEncoder<T>>                       bart_encoder_;
+    const std::shared_ptr<ft::BartEncoderWeight<T>>                 bart_encoder_weight_;
+    const std::unique_ptr<ft::BartDecoding<T>>                      bart_decoding_;
+    const std::shared_ptr<ft::BartDecodingWeight<T>>                bart_decoding_weight_;
     const std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator_;
     const std::unique_ptr<ft::cublasAlgoMap>                      cublas_algo_map_;
     const std::unique_ptr<std::mutex>                             cublas_wrapper_mutex_;

From 0e7412a2ba657c8695fcc3a2efde7d2e61f326f8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 14:58:59 -0700
Subject: [PATCH 003/262] Rename bart CMake sources and target from T5 to Bart

---
 .../triton_backend/bart/CMakeLists.txt               | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/CMakeLists.txt b/src/fastertransformer/triton_backend/bart/CMakeLists.txt
index 22171b5ce..1b952dbe5 100644
--- a/src/fastertransformer/triton_backend/bart/CMakeLists.txt
+++ b/src/fastertransformer/triton_backend/bart/CMakeLists.txt
@@ -15,11 +15,11 @@
 cmake_minimum_required(VERSION 3.8)
 
 set(bart_triton_backend_files
-    T5TritonModel.cc
-    T5TritonModelInstance.cc
+    BartTritonModel.cc
+    BartTritonModelInstance.cc
 )
 
-add_library(T5TritonBackend STATIC ${t5_triton_backend_files})
-set_property(TARGET T5TritonBackend PROPERTY POSITION_INDEPENDENT_CODE  ON)
-target_link_libraries(T5TritonBackend PRIVATE TransformerTritonBackend T5Encoder T5Decoding -lcublasLt)
-target_compile_features(T5TritonBackend PRIVATE cxx_std_14)
+add_library(BartTritonBackend STATIC ${t5_triton_backend_files})
+set_property(TARGET BartTritonBackend PROPERTY POSITION_INDEPENDENT_CODE  ON)
+target_link_libraries(BartTritonBackend PRIVATE TransformerTritonBackend BartEncoder BartDecoding -lcublasLt)
+target_compile_features(BartTritonBackend PRIVATE cxx_std_14)

From da1cdedac5be0a1c690f6b842c06225c151ddad7 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 14:59:14 -0700
Subject: [PATCH 004/262] Add bart subdirectory to triton_backend CMakeLists

---
 src/fastertransformer/triton_backend/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/CMakeLists.txt b/src/fastertransformer/triton_backend/CMakeLists.txt
index 037c36c36..c27c1bb13 100644
--- a/src/fastertransformer/triton_backend/CMakeLists.txt
+++ b/src/fastertransformer/triton_backend/CMakeLists.txt
@@ -19,6 +19,7 @@ target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils)
 
 add_subdirectory(gptj)
 add_subdirectory(gptneox)
+add_subdirectory(bart)
 add_subdirectory(t5)
 add_subdirectory(t5-encoder)
 add_subdirectory(multi_gpu_gpt)

From 6e0ea396ddef8dfc1499d10107107cd736da58c4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 15:01:29 -0700
Subject: [PATCH 005/262] Add BartTritonBackend objects to transformer-shared

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a164ef827..2ed27a8a2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -322,6 +322,7 @@ add_library(transformer-shared SHARED
   $<TARGET_OBJECTS:BertLayerWeight>
   $<TARGET_OBJECTS:BertTritonBackend>
   $<TARGET_OBJECTS:BertWeight>
+  $<TARGET_OBJECTS:BartTritonBackend>
   $<TARGET_OBJECTS:Deberta>
   $<TARGET_OBJECTS:DebertaLayerWeight>
   $<TARGET_OBJECTS:DebertaTritonBackend>

From 5c97871febab10d47a5a02263414cc5527b4a285 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 15:23:57 -0700
Subject: [PATCH 006/262] Fix bart add_library to use bart_triton_backend_files

---
 src/fastertransformer/triton_backend/bart/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/CMakeLists.txt b/src/fastertransformer/triton_backend/bart/CMakeLists.txt
index 1b952dbe5..f37028e7f 100644
--- a/src/fastertransformer/triton_backend/bart/CMakeLists.txt
+++ b/src/fastertransformer/triton_backend/bart/CMakeLists.txt
@@ -19,7 +19,7 @@ set(bart_triton_backend_files
     BartTritonModelInstance.cc
 )
 
-add_library(BartTritonBackend STATIC ${t5_triton_backend_files})
+add_library(BartTritonBackend STATIC ${bart_triton_backend_files})
 set_property(TARGET BartTritonBackend PROPERTY POSITION_INDEPENDENT_CODE  ON)
 target_link_libraries(BartTritonBackend PRIVATE TransformerTritonBackend BartEncoder BartDecoding -lcublasLt)
 target_compile_features(BartTritonBackend PRIVATE cxx_std_14)

From 46317cd553d12de343f75bd67840d9dfa9e68729 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:15:40 -0700
Subject: [PATCH 007/262] commit

---
 .../triton_backend/bart/BartTritonModel.cc    | 64 +++++++++----------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index e30c11a43..058f0483e 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/triton_backend/t5/T5TritonModel.h"
-#include "src/fastertransformer/triton_backend/t5/T5TritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModel.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
 #include "src/fastertransformer/utils/allocator.h"
 
 namespace ft = fastertransformer;
 
-std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createT5Model(std::string model_dir)
+std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createBartModel(std::string model_dir)
 {
     INIReader reader = INIReader(model_dir + "/config.ini");
     if (reader.ParseError() < 0) {
@@ -32,15 +32,15 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createT5Mode
 
     const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type");
     if (data_type == "fp16") {
-        return std::make_shared<T5TritonModel<half>>(reader, model_dir);
+        return std::make_shared<BartTritonModel<half>>(reader, model_dir);
     }
 #ifdef ENABLE_BF16
     else if (data_type == "bf16") {
-        return std::make_shared<T5TritonModel<__nv_bfloat16>>(reader, model_dir);
+        return std::make_shared<BartTritonModel<__nv_bfloat16>>(reader, model_dir);
     }
 #endif
     else if (data_type == "fp32") {
-        return std::make_shared<T5TritonModel<float>>(reader, model_dir);
+        return std::make_shared<BartTritonModel<float>>(reader, model_dir);
     }
     else {
         FT_LOG_ERROR("Unsupported data type " + data_type);
@@ -49,7 +49,7 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createT5Mode
 }
 
 template<typename T>
-T5TritonModel<T>::T5TritonModel(INIReader reader, std::string model_dir): model_dir_(model_dir)
+BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): model_dir_(model_dir)
 {
     // encoder
     encoder_head_num_      = reader.GetInteger("encoder", "num_heads");
@@ -80,24 +80,24 @@ T5TritonModel<T>::T5TritonModel(INIReader reader, std::string model_dir): model_
     tensor_para_size_         = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     pipeline_para_size_       = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
     enable_custom_all_reduce_ = reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0);
-    t5_with_bias_             = reader.GetBoolean("structure", "t5_with_bias", false);
+    bart_with_bias_             = reader.GetBoolean("structure", "bart_with_bias", false);
     use_gated_activation_     = reader.GetBoolean("structure", "use_gated_activation", false);
     position_embedding_type_ =
         ft::PositionEmbeddingType(reader.Get("structure", "position_embedding_type", "relative") == "relative" ? 0 : 1);
-    q_scaling_    = t5_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
+    q_scaling_    = bart_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
     max_distance_ = 128;  // use default value of huggingface here
 }
 
 template<typename T>
-T5TritonModel<T>::T5TritonModel(size_t      tensor_para_size,
+BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
                                 size_t      pipeline_para_size,
                                 int         enable_custom_all_reduce,
                                 std::string model_dir,
                                 int         int8_mode):
     tensor_para_size_(tensor_para_size),
     pipeline_para_size_(pipeline_para_size),
-    encoder_shared_weights_(std::vector<std::shared_ptr<ft::T5EncoderWeight<T>>>(ft::getDeviceCount())),
-    decoding_shared_weights_(std::vector<std::shared_ptr<ft::T5DecodingWeight<T>>>(ft::getDeviceCount())),
+    encoder_shared_weights_(std::vector<std::shared_ptr<ft::BartEncoderWeight<T>>>(ft::getDeviceCount())),
+    decoding_shared_weights_(std::vector<std::shared_ptr<ft::BartDecodingWeight<T>>>(ft::getDeviceCount())),
     enable_custom_all_reduce_(enable_custom_all_reduce),
     model_dir_(model_dir),
     int8_mode_(int8_mode)
@@ -154,12 +154,12 @@ T5TritonModel<T>::T5TritonModel(size_t      tensor_para_size,
     tie_word_embeddings_ = reader.GetBoolean("decoder", "tie_word_embeddings", true);
 
     // common settings
-    t5_with_bias_         = reader.GetBoolean("structure", "t5_with_bias", false);
+    bart_with_bias_         = reader.GetBoolean("structure", "bart_with_bias", false);
     use_gated_activation_ = reader.GetBoolean("structure", "use_gated_activation", false);
     activation_type_      = ft::getActivationType(reader.Get("encoder", "feed_forward_proj"));
     position_embedding_type_ =
         ft::PositionEmbeddingType(reader.Get("structure", "position_embedding_type", "relative") == "relative" ? 0 : 1);
-    q_scaling_ = t5_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
+    q_scaling_ = bart_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
 
     ia3_num_tasks_ = reader.GetInteger("structure", "ia3_num_tasks", 0);
 
@@ -168,7 +168,7 @@ T5TritonModel<T>::T5TritonModel(size_t      tensor_para_size,
 
 template<typename T>
 std::unique_ptr<AbstractTransformerModelInstance>
-T5TritonModel<T>::createModelInstance(int                                                               device_id,
+BartTritonModel<T>::createModelInstance(int                                                               device_id,
                                       int                                                               rank,
                                       cudaStream_t                                                      stream,
                                       std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
@@ -212,14 +212,14 @@ T5TritonModel<T>::createModelInstance(int
     const int sm_ = ft::getSMVersion();
 
     // TODO(bhsueh) not support fused mha
-    // NOTE: fmha doesn't support t5-style relative position bias
+    // NOTE: fmha doesn't support bart-style relative position bias
     ft::AttentionType attention_type =
         ft::getAttentionType<T>(encoder_size_per_head_, sm_, true, encoder_num_bucket_or_max_pos_seq_len_, false);
 
     ft::NcclParam tensor_para_   = nccl_params.first[comms_rank];
     ft::NcclParam pipeline_para_ = nccl_params.second[comms_rank];
 
-    auto encoder = std::make_unique<ft::T5Encoder<T>>(ft::T5Encoder<T>(0,
+    auto encoder = std::make_unique<ft::BartEncoder<T>>(ft::BartEncoder<T>(0,
                                                                        0,
                                                                        encoder_head_num_,
                                                                        encoder_size_per_head_,
@@ -249,7 +249,7 @@ T5TritonModel<T>::createModelInstance(int
                                                                        enable_custom_all_reduce_,
                                                                        encoder_adapter_));
 
-    auto decoding = std::make_unique<ft::T5Decoding<T>>(ft::T5Decoding<T>(0,
+    auto decoding = std::make_unique<ft::BartDecoding<T>>(ft::BartDecoding<T>(0,
                                                                           0,
                                                                           0,
                                                                           0,
@@ -286,7 +286,7 @@ T5TritonModel<T>::createModelInstance(int
                                                                           enable_custom_all_reduce_,
                                                                           decoding_adapter_));
 
-    return std::unique_ptr<T5TritonModelInstance<T>>(new T5TritonModelInstance<T>(std::move(encoder),
+    return std::unique_ptr<BartTritonModelInstance<T>>(new BartTritonModelInstance<T>(std::move(encoder),
                                                                                   std::move(decoding),
                                                                                   encoder_shared_weights_[device_id],
                                                                                   decoding_shared_weights_[device_id],
@@ -298,14 +298,14 @@ T5TritonModel<T>::createModelInstance(int
 }
 
 template<typename T>
-void T5TritonModel<T>::createSharedWeights(int device_id, int rank)
+void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
 {
     ft::check_cuda_error(cudaSetDevice(device_id));
     const int tensor_para_rank   = rank % tensor_para_size_;
     const int pipeline_para_rank = rank / tensor_para_size_;
 
     encoder_shared_weights_[device_id] =
-        std::make_shared<ft::T5EncoderWeight<T>>(encoder_head_num_,
+        std::make_shared<ft::BartEncoderWeight<T>>(encoder_head_num_,
                                                  encoder_size_per_head_,
                                                  encoder_d_model_,
                                                  encoder_inter_size_,
@@ -316,7 +316,7 @@ void T5TritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  tensor_para_rank,
                                                  pipeline_para_size_,
                                                  pipeline_para_rank,
-                                                 t5_with_bias_,
+                                                 bart_with_bias_,
                                                  use_gated_activation_,
                                                  position_embedding_type_,
                                                  prompt_learning_type_,
@@ -325,7 +325,7 @@ void T5TritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  encoder_adapter_.interSize());
 
     decoding_shared_weights_[device_id] =
-        std::make_shared<ft::T5DecodingWeight<T>>(decoding_head_num_,
+        std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
                                                   decoding_size_per_head_,
                                                   decoding_d_model_,
                                                   decoding_inter_size_,
@@ -337,7 +337,7 @@ void T5TritonModel<T>::createSharedWeights(int device_id, int rank)
                                                   tensor_para_rank,
                                                   pipeline_para_size_,
                                                   pipeline_para_rank,
-                                                  t5_with_bias_,
+                                                  bart_with_bias_,
                                                   use_gated_activation_,
                                                   position_embedding_type_,
                                                   ia3_num_tasks_,
@@ -348,7 +348,7 @@ void T5TritonModel<T>::createSharedWeights(int device_id, int rank)
 }
 
 template<typename T>
-std::string T5TritonModel<T>::toString()
+std::string BartTritonModel<T>::toString()
 {
     std::stringstream ss;
     std::string       position_embedding_type_string =
@@ -365,7 +365,7 @@ std::string T5TritonModel<T>::toString()
        << "\n    decoding_d_model_: " << decoding_d_model_ << "\n    decoding_inter_size_: " << decoding_inter_size_
        << "\n    decoding_num_layer_: " << decoding_num_layer_ << "\n    decoding_vocab_size_: " << decoding_vocab_size_
        << "\n    decoding_num_bucket_or_max_pos_seq_len_: " << decoding_num_bucket_or_max_pos_seq_len_
-       << "\n    decoding_adapter: " << decoding_adapter_.toString() << "\n    t5_with_bias_: " << t5_with_bias_
+       << "\n    decoding_adapter: " << decoding_adapter_.toString() << "\n    bart_with_bias_: " << bart_with_bias_
        << "\n    use_gated_activation_: " << use_gated_activation_
        << "\n   position_embedding_type_: " << position_embedding_type_string << "\n    start_id_: " << start_id_
        << "\n    end_id_: " << end_id_ << "\n    model_name_: " << model_name_ << "\n    model_dir_: " << model_dir_
@@ -375,7 +375,7 @@ std::string T5TritonModel<T>::toString()
 }
 
 template<typename T>
-void T5TritonModel<T>::createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
+void BartTritonModel<T>::createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
                                          int                                                   world_size)
 {
     using commDataType = typename ft::CustomARCommTypeConverter<T>::Type;
@@ -383,19 +383,19 @@ void T5TritonModel<T>::createCustomComms(std::vector<std::shared_ptr<ft::Abstrac
 }
 
 template<typename T>
-int T5TritonModel<T>::getTensorParaSize()
+int BartTritonModel<T>::getTensorParaSize()
 {
     return tensor_para_size_;
 }
 
 template<typename T>
-int T5TritonModel<T>::getPipelineParaSize()
+int BartTritonModel<T>::getPipelineParaSize()
 {
     return pipeline_para_size_;
 }
 
-template struct T5TritonModel<float>;
-template struct T5TritonModel<half>;
+template struct BartTritonModel<float>;
+template struct BartTritonModel<half>;
 #ifdef ENABLE_BF16
-template struct T5TritonModel<__nv_bfloat16>;
+template struct BartTritonModel<__nv_bfloat16>;
 #endif

From c732029841882627868f07a9ced8f903ead7c716 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:16:56 -0700
Subject: [PATCH 008/262] commit

---
 .../triton_backend/transformer_triton_backend.hpp                | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/transformer_triton_backend.hpp b/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
index edffabfd7..3b9ef2d08 100644
--- a/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
+++ b/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
@@ -294,6 +294,7 @@ struct AbstractTransformerModel {
     static std::shared_ptr<AbstractTransformerModel> createT5Model(std::string model_dir);
     static std::shared_ptr<AbstractTransformerModel> createT5EncoderModel(std::string model_dir);
     static std::shared_ptr<AbstractTransformerModel> createLlamaModel(std::string model_dir);
+    static std::shared_ptr<AbstractTransformerModel> createBartModel(std::string model_dir);
 
     std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
     createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false);

From a81eeaa2a0ca5f76e1ed80b0fbe0deea30007410 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:28:45 -0700
Subject: [PATCH 009/262] commit

---
 .../triton_backend/bart/BartTritonModel.cc          | 13 -------------
 .../triton_backend/bart/BartTritonModel.h           |  8 --------
 2 files changed, 21 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 058f0483e..40231f40a 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -124,19 +124,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
     encoder_adapter_.interSize(reader.GetInteger("encoder", "adapter_inter_size", 0));
     encoder_adapter_.layerNormType(reader.Get("encoder", "adapter_norm_position", "pre"));
 
-    // encoder prompt
-    num_tasks_                = reader.GetInteger("encoder", "num_tasks", 0);
-    prompt_learning_start_id_ = reader.GetInteger("encoder", "prompt_learning_start_id", encoder_vocab_size_ + 1);
-    prompt_learning_type_ =
-        static_cast<ft::PromptLearningType>(reader.GetInteger("encoder", "prompt_learning_type", 0));
-
-    for (int task_name_id = 0; task_name_id < num_tasks_; task_name_id++) {
-        std::string config_task_name = "task_" + std::to_string(task_name_id);
-        std::string task_name        = reader.Get(config_task_name, "task_name");
-        const int   prompt_length    = reader.GetInteger(config_task_name, "prompt_length", 0);
-        prompt_learning_table_pair_.insert({task_name, {task_name_id, prompt_length}});
-    }
-
     // decoding
     decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
     decoding_size_per_head_ = reader.GetInteger("decoder", "d_kv");
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index 1ffe0a407..2675af20d 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -64,13 +64,6 @@ struct BartTritonModel: public AbstractTransformerModel {
     size_t                  encoder_num_layer_;
     size_t                  encoder_vocab_size_;
     size_t                  encoder_num_bucket_or_max_pos_seq_len_;
-    ft::LinearAdapterConfig encoder_adapter_{};
-
-    // prompt for encoder
-    size_t                                     num_tasks_                  = 0;
-    int                                        prompt_learning_start_id_   = 0;
-    ft::PromptLearningType                     prompt_learning_type_       = ft::PromptLearningType::no_prompt;
-    std::map<std::string, std::pair<int, int>> prompt_learning_table_pair_ = {};
 
     // decoding
     size_t                  decoding_head_num_;
@@ -80,7 +73,6 @@ struct BartTritonModel: public AbstractTransformerModel {
     size_t                  decoding_num_layer_;
     size_t                  decoding_vocab_size_;
     size_t                  decoding_num_bucket_or_max_pos_seq_len_;
-    ft::LinearAdapterConfig decoding_adapter_{};
 
     float  q_scaling_;
     size_t ia3_num_tasks_;

From 6b72baa89c022934dd64127d678c2bb73039b886 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:41:49 -0700
Subject: [PATCH 010/262] commit

---
 .../triton_backend/bart/BartTritonModel.cc         | 14 ++++----------
 .../triton_backend/bart/BartTritonModel.h          |  2 +-
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 40231f40a..72a4817b2 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -148,8 +148,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
         ft::PositionEmbeddingType(reader.Get("structure", "position_embedding_type", "relative") == "relative" ? 0 : 1);
     q_scaling_ = bart_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
 
-    ia3_num_tasks_ = reader.GetInteger("structure", "ia3_num_tasks", 0);
-
     max_distance_ = 128;  // use default value of huggingface here
 }
 
@@ -304,12 +302,9 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  pipeline_para_size_,
                                                  pipeline_para_rank,
                                                  bart_with_bias_,
+                                                 mbart_para_,
                                                  use_gated_activation_,
-                                                 position_embedding_type_,
-                                                 prompt_learning_type_,
-                                                 prompt_learning_table_pair_,
-                                                 ia3_num_tasks_,
-                                                 encoder_adapter_.interSize());
+                                                 position_embedding_type_);
 
     decoding_shared_weights_[device_id] =
         std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
@@ -325,10 +320,9 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                   pipeline_para_size_,
                                                   pipeline_para_rank,
                                                   bart_with_bias_,
+                                                  mbart_para_,
                                                   use_gated_activation_,
-                                                  position_embedding_type_,
-                                                  ia3_num_tasks_,
-                                                  decoding_adapter_.interSize());
+                                                  position_embedding_type_,);
 
     encoder_shared_weights_[device_id]->loadModel(model_dir_);
     decoding_shared_weights_[device_id]->loadModel(model_dir_);
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index 2675af20d..4c9f19cfc 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -75,7 +75,6 @@ struct BartTritonModel: public AbstractTransformerModel {
     size_t                  decoding_num_bucket_or_max_pos_seq_len_;
 
     float  q_scaling_;
-    size_t ia3_num_tasks_;
 
     size_t max_distance_;
     int    start_id_;
@@ -92,6 +91,7 @@ struct BartTritonModel: public AbstractTransformerModel {
 
     // bart structure difference
     bool                      bart_with_bias_;
+    bool                      mbart_para_ = false;
     bool                      use_gated_activation_;
     ft::PositionEmbeddingType position_embedding_type_;
     ft::ActivationType        activation_type_;

From 6580def441ffdf31f0908c403184a011a2f4d7ab Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:43:36 -0700
Subject: [PATCH 011/262] commit

---
 .../triton_backend/bart/BartTritonModel.cc               | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 72a4817b2..03884fc62 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -60,8 +60,6 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     encoder_vocab_size_    = reader.GetInteger("encoder", "vocab_size");
     encoder_num_bucket_or_max_pos_seq_len_ =
         reader.GetInteger("encoder", "relative_attention_num_buckets_or_max_pos_seq_len");
-    encoder_adapter_.interSize(reader.GetInteger("encoder", "adapter_inter_size", 0));
-    encoder_adapter_.layerNormType(reader.Get("encoder", "adapter_norm_position", "pre"));
 
     // decoding
     decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
@@ -121,8 +119,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
     encoder_vocab_size_    = reader.GetInteger("encoder", "vocab_size");
     encoder_num_bucket_or_max_pos_seq_len_ =
         reader.GetInteger("encoder", "relative_attention_num_buckets_or_max_pos_seq_len");
-    encoder_adapter_.interSize(reader.GetInteger("encoder", "adapter_inter_size", 0));
-    encoder_adapter_.layerNormType(reader.Get("encoder", "adapter_norm_position", "pre"));
 
     // decoding
     decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
@@ -133,8 +129,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
     decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
     decoding_num_bucket_or_max_pos_seq_len_ =
         reader.GetInteger("decoder", "relative_attention_num_buckets_or_max_pos_seq_len");
-    decoding_adapter_.interSize(reader.GetInteger("decoder", "adapter_inter_size", 0));
-    decoding_adapter_.layerNormType(reader.Get("decoder", "adapter_norm_position", "pre"));
 
     start_id_            = reader.GetInteger("decoder", "decoder_start_token_id");
     end_id_              = reader.GetInteger("decoder", "eos_token_id");
@@ -340,13 +334,12 @@ std::string BartTritonModel<T>::toString()
        << "\n    encoder_d_model_: " << encoder_d_model_ << "\n    encoder_inter_size_: " << encoder_inter_size_
        << "\n    encoder_num_layer_: " << encoder_num_layer_ << "\n    encoder_vocab_size_: " << encoder_vocab_size_
        << "\n    encoder_num_bucket_or_max_pos_seq_len_: " << encoder_num_bucket_or_max_pos_seq_len_
-       << "\n    encoder_adapter_: " << encoder_adapter_.toString()
        << "\n    decoding_head_num_: " << decoding_head_num_
        << "\n    decoding_size_per_head_: " << decoding_size_per_head_
        << "\n    decoding_d_model_: " << decoding_d_model_ << "\n    decoding_inter_size_: " << decoding_inter_size_
        << "\n    decoding_num_layer_: " << decoding_num_layer_ << "\n    decoding_vocab_size_: " << decoding_vocab_size_
        << "\n    decoding_num_bucket_or_max_pos_seq_len_: " << decoding_num_bucket_or_max_pos_seq_len_
-       << "\n    decoding_adapter: " << decoding_adapter_.toString() << "\n    bart_with_bias_: " << bart_with_bias_
+       << "\n    bart_with_bias_: " << bart_with_bias_
        << "\n    use_gated_activation_: " << use_gated_activation_
        << "\n   position_embedding_type_: " << position_embedding_type_string << "\n    start_id_: " << start_id_
        << "\n    end_id_: " << end_id_ << "\n    model_name_: " << model_name_ << "\n    model_dir_: " << model_dir_

From b961c97834b3bacd45cd4e84c317c92586a04a20 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:50:51 -0700
Subject: [PATCH 012/262] commit

---
 .../triton_backend/bart/BartTritonModel.cc      | 17 ++++-------------
 .../triton_backend/bart/BartTritonModel.h       |  1 +
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 03884fc62..3318e94a0 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -206,12 +206,9 @@ BartTritonModel<T>::createModelInstance(int
                                                                        encoder_d_model_,
                                                                        encoder_num_layer_,
                                                                        encoder_num_bucket_or_max_pos_seq_len_,
-                                                                       0,  // expert_num
                                                                        max_distance_,
-                                                                       0,  // moe_k
                                                                        sm_,
                                                                        q_scaling_,
-                                                                       {},  // moe_layer_index
                                                                        stream,
                                                                        cublas_wrapper.get(),
                                                                        allocator.get(),
@@ -219,14 +216,11 @@ BartTritonModel<T>::createModelInstance(int
                                                                        attention_type,
                                                                        false,
                                                                        activation_type_,
-                                                                       ft::LayerNormType::pre_layernorm,
+                                                                       layernorm_type_,
                                                                        tensor_para_,
                                                                        pipeline_para_,
-                                                                       prompt_learning_start_id_,
-                                                                       prompt_learning_type_,
                                                                        custom_all_reduce_comm,
-                                                                       enable_custom_all_reduce_,
-                                                                       encoder_adapter_));
+                                                                       enable_custom_all_reduce_));
 
     auto decoding = std::make_unique<ft::BartDecoding<T>>(ft::BartDecoding<T>(0,
                                                                           0,
@@ -239,9 +233,7 @@ BartTritonModel<T>::createModelInstance(int
                                                                           decoding_num_layer_,
                                                                           decoding_vocab_size_,
                                                                           decoding_num_bucket_or_max_pos_seq_len_,
-                                                                          0,  // expert_num
                                                                           max_distance_,
-                                                                          0,  // moe_k
                                                                           q_scaling_,
                                                                           start_id_,
                                                                           end_id_,
@@ -251,7 +243,6 @@ BartTritonModel<T>::createModelInstance(int
                                                                           1.0f,  // temperature_,
                                                                           0.0f,  // len_penalty_,
                                                                           1.0f,  // repetition_penalty_,
-                                                                          {},    // moe_layer_index
                                                                           stream,
                                                                           cublas_wrapper.get(),
                                                                           allocator.get(),
@@ -260,10 +251,10 @@ BartTritonModel<T>::createModelInstance(int
                                                                           tensor_para_,
                                                                           pipeline_para_,
                                                                           activation_type_,
+                                                                          layernorm_type_,
                                                                           tie_word_embeddings_,
                                                                           custom_all_reduce_comm,
-                                                                          enable_custom_all_reduce_,
-                                                                          decoding_adapter_));
+                                                                          enable_custom_all_reduce_));
 
     return std::unique_ptr<BartTritonModelInstance<T>>(new BartTritonModelInstance<T>(std::move(encoder),
                                                                                   std::move(decoding),
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index 4c9f19cfc..01cbd3647 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -95,6 +95,7 @@ struct BartTritonModel: public AbstractTransformerModel {
     bool                      use_gated_activation_;
     ft::PositionEmbeddingType position_embedding_type_;
     ft::ActivationType        activation_type_;
+    ft::LayerNormType         layernorm_type_;
 
     bool is_fp16_;
     int  int8_mode_;

From 276c8f4b9e31f040e6cc9399988138e7e05ef346 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 16:55:15 -0700
Subject: [PATCH 013/262] commit

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 3318e94a0..9e592dddc 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -70,8 +70,6 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
     decoding_num_bucket_or_max_pos_seq_len_ =
         reader.GetInteger("decoder", "relative_attention_num_buckets_or_max_pos_seq_len");
-    decoding_adapter_.interSize(reader.GetInteger("decoder", "adapter_inter_size", 0));
-    decoding_adapter_.layerNormType(reader.Get("decoder", "adapter_norm_position", "pre"));
 
     start_id_                 = reader.GetInteger("decoder", "decoder_start_token_id");
     end_id_                   = reader.GetInteger("decoder", "eos_token_id");
@@ -307,7 +305,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                   bart_with_bias_,
                                                   mbart_para_,
                                                   use_gated_activation_,
-                                                  position_embedding_type_,);
+                                                  position_embedding_type_);
 
     encoder_shared_weights_[device_id]->loadModel(model_dir_);
     decoding_shared_weights_[device_id]->loadModel(model_dir_);

From d40b6a80806340bfa3bbe805834fae34588d7c13 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 17:27:20 -0700
Subject: [PATCH 014/262] commit

---
 .../bart/BartTritonModelInstance.cc           | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 05e81e253..069312391 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/triton_backend/t5/T5TritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
 #include "src/fastertransformer/triton_backend/triton_utils.hpp"
 #include "src/fastertransformer/utils/Tensor.h"
@@ -25,26 +25,26 @@ namespace ft = fastertransformer;
 template<typename T>
 void triton_stream_callback(ft::TensorMap* output_tensors, void* ctx)
 {
-    auto* const model  = reinterpret_cast<T5TritonModelInstance<T>*>(ctx);
-    auto const  result = T5TritonModelInstance<T>::convert_outputs(*output_tensors);
+    auto* const model  = reinterpret_cast<BartTritonModelInstance<T>*>(ctx);
+    auto const  result = BartTritonModelInstance<T>::convert_outputs(*output_tensors);
 
     model->stream_cb_(result, model->stream_ctx_);
 }
 
 template<typename T>
-T5TritonModelInstance<T>::T5TritonModelInstance(std::unique_ptr<ft::T5Encoder<T>>        t5_encoder,
-                                                std::unique_ptr<ft::T5Decoding<T>>       t5_decoding,
-                                                std::shared_ptr<ft::T5EncoderWeight<T>>  t5_encoder_weight,
-                                                std::shared_ptr<ft::T5DecodingWeight<T>> t5_decoding_weight,
+BartTritonModelInstance<T>::BartTritonModelInstance(std::unique_ptr<ft::BartEncoder<T>>        bart_encoder,
+                                                std::unique_ptr<ft::BartDecoding<T>>       bart_decoding,
+                                                std::shared_ptr<ft::BartEncoderWeight<T>>  bart_encoder_weight,
+                                                std::shared_ptr<ft::BartDecodingWeight<T>> bart_decoding_weight,
                                                 std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator,
                                                 std::unique_ptr<ft::cublasAlgoMap>                      cublas_algo_map,
                                                 std::unique_ptr<std::mutex>          cublas_wrapper_mutex,
                                                 std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper,
                                                 std::unique_ptr<cudaDeviceProp>      cuda_device_prop_ptr):
-    t5_encoder_(std::move(t5_encoder)),
-    t5_decoding_(std::move(t5_decoding)),
-    t5_encoder_weight_(t5_encoder_weight),
-    t5_decoding_weight_(t5_decoding_weight),
+    bart_encoder_(std::move(bart_encoder)),
+    bart_decoding_(std::move(bart_decoding)),
+    bart_encoder_weight_(bart_encoder_weight),
+    bart_decoding_weight_(bart_decoding_weight),
     allocator_(std::move(allocator)),
     cublas_algo_map_(std::move(cublas_algo_map)),
     cublas_wrapper_mutex_(std::move(cublas_wrapper_mutex)),
@@ -55,7 +55,7 @@ T5TritonModelInstance<T>::T5TritonModelInstance(std::unique_ptr<ft::T5Encoder<T>
 
 template<typename T>
 ft::TensorMap
-T5TritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
+BartTritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
 {
     move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
     move_tensor_H2D(input_tensors->at("sequence_length"), d_input_lengths_, &allocator_);
@@ -88,7 +88,7 @@ T5TritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<std:
 
 template<typename T>
 std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
-T5TritonModelInstance<T>::convert_outputs(ft::TensorMap& output_tensors)
+BartTritonModelInstance<T>::convert_outputs(ft::TensorMap& output_tensors)
 {
     std::unordered_map<std::string, triton::Tensor>* outputs_mapping =
         new std::unordered_map<std::string, triton::Tensor>();
@@ -102,7 +102,7 @@ T5TritonModelInstance<T>::convert_outputs(ft::TensorMap& output_tensors)
 
 template<typename T>
 std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
-T5TritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
+BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
 {
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
@@ -123,7 +123,7 @@ T5TritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string
         {{"output_hidden_state",
           ft::Tensor{ft::MEMORY_GPU,
                      ft::getTensorType<T>(),
-                     std::vector<size_t>{request_batch_size, mem_max_seq_len, t5_encoder_->getDModel()},
+                     std::vector<size_t>{request_batch_size, mem_max_seq_len, bart_encoder_->getDModel()},
                      d_encoder_outputs_}}});
 
     ft::TensorMap decoding_input_tensors({{"encoder_output", encoder_output_tensors.at("output_hidden_state")},
@@ -195,10 +195,10 @@ T5TritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string
     }
 
     if (has_ia3_tasks) {
-        const auto num_ia3_tasks = t5_encoder_weight_->getNumIA3Tasks();
+        const auto num_ia3_tasks = bart_encoder_weight_->getNumIA3Tasks();
         FT_CHECK_WITH_INFO(num_ia3_tasks > 0, "Cannot request ia3_tasks, model has no IA3 adapters");
         const bool is_within_range = ft::invokeCheckRange<int>(
-            d_input_ia3_tasks_, request_batch_size, 0, num_ia3_tasks - 1, d_within_range_, t5_encoder_->getStream());
+            d_input_ia3_tasks_, request_batch_size, 0, num_ia3_tasks - 1, d_within_range_, bart_encoder_->getStream());
         FT_CHECK_WITH_INFO(is_within_range,
                            ft::fmtstr("Requested IA3 tasks aren't in the range [0, %d).", num_ia3_tasks));
 
@@ -207,14 +207,14 @@ T5TritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string
 
     try {
         if (stream_cb_ != nullptr) {
-            t5_decoding_->registerCallback(triton_stream_callback<T>, this);
+            bart_decoding_->registerCallback(triton_stream_callback<T>, this);
         }
 
-        t5_encoder_->forward(&encoder_output_tensors, &encoder_input_tensors, t5_encoder_weight_.get());
-        t5_decoding_->forward(&decoding_output_tensors, &decoding_input_tensors, t5_decoding_weight_.get());
+        bart_encoder_->forward(&encoder_output_tensors, &encoder_input_tensors, bart_encoder_weight_.get());
+        bart_decoding_->forward(&decoding_output_tensors, &decoding_input_tensors, bart_decoding_weight_.get());
 
         if (stream_cb_ != nullptr) {
-            t5_decoding_->unRegisterCallback();
+            bart_decoding_->unRegisterCallback();
         }
     }
     catch (...) {
@@ -227,13 +227,13 @@ T5TritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string
 }
 
 template<typename T>
-T5TritonModelInstance<T>::~T5TritonModelInstance()
+BartTritonModelInstance<T>::~BartTritonModelInstance()
 {
     freeBuffer();
 }
 
 template<typename T>
-void T5TritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
+void BartTritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
                                               const size_t beam_width,
                                               const size_t max_output_len,
                                               const size_t mem_max_seq_len)
@@ -241,7 +241,7 @@ void T5TritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
     d_output_ids_      = (int*)(allocator_->reMalloc(
         d_output_ids_, sizeof(int) * request_batch_size * beam_width * max_output_len, false));
     d_encoder_outputs_ = (T*)(allocator_->reMalloc(
-        d_encoder_outputs_, sizeof(T) * request_batch_size * mem_max_seq_len * t5_encoder_->getDModel(), false));
+        d_encoder_outputs_, sizeof(T) * request_batch_size * mem_max_seq_len * bart_encoder_->getDModel(), false));
     d_sequence_lengths_ =
         (int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false));
     d_output_log_probs_ = (float*)(allocator_->reMalloc(
@@ -252,7 +252,7 @@ void T5TritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
 }
 
 template<typename T>
-void T5TritonModelInstance<T>::freeBuffer()
+void BartTritonModelInstance<T>::freeBuffer()
 {
     allocator_->free((void**)(&d_encoder_outputs_));
     allocator_->free((void**)(&d_output_ids_));
@@ -262,8 +262,8 @@ void T5TritonModelInstance<T>::freeBuffer()
     allocator_->free((void**)(&d_within_range_));
 }
 
-template struct T5TritonModelInstance<float>;
-template struct T5TritonModelInstance<half>;
+template struct BartTritonModelInstance<float>;
+template struct BartTritonModelInstance<half>;
 #ifdef ENABLE_BF16
-template struct T5TritonModelInstance<__nv_bfloat16>;
-#endif
\ No newline at end of file
+template struct BartTritonModelInstance<__nv_bfloat16>;
+#endif

From 0286429e244756d55f1a46d619c980fa5a9bece4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 17:28:53 -0700
Subject: [PATCH 015/262] commit

---
 .../bart/BartTritonModelInstance.cc           | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 069312391..efaf99387 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -80,9 +80,6 @@ BartTritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<st
             {"request_prompt_embedding",
              as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)});
     }
-    if (input_tensors->count("ia3_tasks")) {
-        ft_input_tensors.insert({"ia3_tasks", as_GPU_tensor(input_tensors->at("ia3_tasks"), d_input_ia3_tasks_)});
-    }
     return ft_input_tensors;
 }
 
@@ -109,14 +106,9 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);
     const size_t beam_width =
         input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
-    const bool has_ia3_tasks = input_tensors->count("ia3_tasks");
 
     allocateBuffer(request_batch_size, beam_width, max_output_len, mem_max_seq_len);
 
-    if (has_ia3_tasks) {
-        move_tensor_H2D(input_tensors->at("ia3_tasks"), d_input_ia3_tasks_, &allocator_);
-    }
-
     ft::TensorMap encoder_input_tensors(convert_inputs(input_tensors));
 
     ft::TensorMap encoder_output_tensors(
@@ -147,7 +139,6 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
                                          "sequence_length",
                                          "bad_words_list",
                                          "stop_words_list",
-                                         "ia3_tasks",
                                          "top_p_decay",
                                          "top_p_min",
                                          "top_p_reset_ids"};
@@ -194,17 +185,6 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
                                                    d_cum_log_probs_}});
     }
 
-    if (has_ia3_tasks) {
-        const auto num_ia3_tasks = bart_encoder_weight_->getNumIA3Tasks();
-        FT_CHECK_WITH_INFO(num_ia3_tasks > 0, "Cannot request ia3_tasks, model has no IA3 adapters");
-        const bool is_within_range = ft::invokeCheckRange<int>(
-            d_input_ia3_tasks_, request_batch_size, 0, num_ia3_tasks - 1, d_within_range_, bart_encoder_->getStream());
-        FT_CHECK_WITH_INFO(is_within_range,
-                           ft::fmtstr("Requested IA3 tasks aren't in the range [0, %d).", num_ia3_tasks));
-
-        decoding_input_tensors.insert({"ia3_tasks", as_GPU_tensor(input_tensors->at("ia3_tasks"), d_input_ia3_tasks_)});
-    }
-
     try {
         if (stream_cb_ != nullptr) {
             bart_decoding_->registerCallback(triton_stream_callback<T>, this);

From f4501880b36cfe412461eed0cefb3e770abc4aab Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 17:35:12 -0700
Subject: [PATCH 016/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 11 +++++++++++
 src/fastertransformer/models/bart/BartDecoding.h  |  3 +++
 2 files changed, 14 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index a0e2d876e..6286f7b26 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -321,6 +321,17 @@ BartDecoding<T>::~BartDecoding()
     freeBuffer();
 }
 
+template<typename T>
+void BartDecoding<T>::registerCallback(callback_sig* fn, void* ctx)
+{
+}
+
+template<typename T>
+void BartDecoding<T>::unRegisterCallback()
+{
+}
+
+
 template<typename T>
 void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                               TensorMap*                   input_tensors,
diff --git a/src/fastertransformer/models/bart/BartDecoding.h b/src/fastertransformer/models/bart/BartDecoding.h
index 09ec9c2fa..5d442ddc5 100644
--- a/src/fastertransformer/models/bart/BartDecoding.h
+++ b/src/fastertransformer/models/bart/BartDecoding.h
@@ -170,6 +170,9 @@ class BartDecoding: public BaseLayer {
     void forward(TensorMap* output_tensors, TensorMap* input_tensors, const BartDecodingWeight<T>* Decoding_weights);
 
     void setStream(cudaStream_t stream) override;
+
+    void registerCallback(callback_sig* fn, void* ctx);
+    void unRegisterCallback();
 };
 
 }  // namespace fastertransformer

From 2365cdb240dc0212261b2286bcb1d6e9fe9b9596 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 17:36:23 -0700
Subject: [PATCH 017/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.h b/src/fastertransformer/models/bart/BartDecoding.h
index 5d442ddc5..50bf6edd3 100644
--- a/src/fastertransformer/models/bart/BartDecoding.h
+++ b/src/fastertransformer/models/bart/BartDecoding.h
@@ -128,6 +128,8 @@ class BartDecoding: public BaseLayer {
     const bool     using_beam_hyps = true;
     BeamHypotheses beam_hyps_;
 
+    using callback_sig                 = void(TensorMap*, void*);
+
 public:
     BartDecoding(size_t                              max_batch_size,
                  size_t                              max_seq_len,

From 1543b006f617da05f7a8c2f6ed1e475a7c89adae Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 20:37:28 -0700
Subject: [PATCH 018/262] commit

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 4 +---
 src/fastertransformer/triton_backend/bart/BartTritonModel.h  | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 9e592dddc..334d9eb5f 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -76,10 +76,8 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     tensor_para_size_         = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     pipeline_para_size_       = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
     enable_custom_all_reduce_ = reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0);
-    bart_with_bias_             = reader.GetBoolean("structure", "bart_with_bias", false);
+    bart_with_bias_           = reader.GetBoolean("structure", "bart_with_bias", true);
     use_gated_activation_     = reader.GetBoolean("structure", "use_gated_activation", false);
-    position_embedding_type_ =
-        ft::PositionEmbeddingType(reader.Get("structure", "position_embedding_type", "relative") == "relative" ? 0 : 1);
     q_scaling_    = bart_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
     max_distance_ = 128;  // use default value of huggingface here
 }
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index 01cbd3647..135f084f2 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -93,9 +93,9 @@ struct BartTritonModel: public AbstractTransformerModel {
     bool                      bart_with_bias_;
     bool                      mbart_para_ = false;
     bool                      use_gated_activation_;
-    ft::PositionEmbeddingType position_embedding_type_;
+    ft::PositionEmbeddingType position_embedding_type_ = ft::PositionEmbeddingType::absolute;
     ft::ActivationType        activation_type_;
-    ft::LayerNormType         layernorm_type_;
+    ft::LayerNormType         layernorm_type_ = ft::LayerNormType::post_layernorm;
 
     bool is_fp16_;
     int  int8_mode_;

From a26793a0ea7930143aeb24fe55d506497e01b05b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 20:37:52 -0700
Subject: [PATCH 019/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 227 ++++++++++++++++++
 1 file changed, 227 insertions(+)
 create mode 100644 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
new file mode 100644
index 000000000..6763c940d
--- /dev/null
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -0,0 +1,227 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import configparser
+import multiprocessing
+from datetime import datetime
+import logging
+from pathlib import Path
+
+import sys
+import os
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(dir_path + "/../../../../3rdparty/transformers/src/")
+
+from transformers import BartForConditionalGeneration, BartEncoderModel
+
+import numpy as np
+import torch  # pytype: disable=import-error
+
+LOGGER = logging.getLogger(__name__)
+
+rename_mapping = {"relative_attention_num_buckets": "relative_attention_num_buckets_or_max_pos_seq_len"}
+new_configs = {
+    "structure": {"t5_with_bias": "false", "use_gated_activation": "false", "position_embedding_type": "relative"}}
+
+
+def get_weight_data_type(data_type):
+    if data_type == "fp32":
+        return np.float32
+    elif data_type == "fp16":
+        return np.float16
+    else:
+        assert False, f"Invalid weight data type {data_type}"
+
+
+def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
+    model_dict = {}
+    for name, param in model.named_parameters():
+        if name.find("decoder") != -1 and name.find("SelfAttention") != -1:
+            model_dict[name] = param
+
+    for i in range(model.decoder.config.num_layers):
+        shape = model_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"].T.shape
+        qkv = torch.cat([model_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"].T,
+                         model_dict[f"decoder.block.{i}.layer.0.SelfAttention.k.weight"].T,
+                         model_dict[f"decoder.block.{i}.layer.0.SelfAttention.v.weight"].T], dim=-1)
+
+        qkv = qkv.reshape([shape[0], 3, shape[1]])
+        qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
+
+        split_vals = np.split(qkv, factor, axis=-1)
+        for j in range(factor):
+            saved_path = saved_dir / f"decoder.block.{i}.layer.0.SelfAttention.qkv.weight.{j}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+
+
+def split_and_convert_process(key, val, factor, saved_dir):
+    if val.ndim == 2:
+        val = val.transpose(1, 0)
+    saved_key = key
+    LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
+
+    if key.find("shared.weight") != -1:
+        # shared weights, only need to convert the weights of rank 0
+        saved_path = saved_dir / f"{saved_key}.bin"
+        val.tofile(saved_path.as_posix())
+
+        saved_path = saved_dir / f"{saved_key}_T.bin"
+        val.T.tofile(saved_path.as_posix())
+    elif key.find("lm_head.weight") != -1:
+        # lm_head weights, only need to convert the weights of rank 0
+        val = val.transpose(1, 0)  # For lm_head, we use TN gemm to compute, so we don't need to transpose
+        saved_path = saved_dir / f"{saved_key}.bin"
+        val.tofile(saved_path.as_posix())
+
+    elif key.find("layer_norm.weight") != -1:
+        # shared weights, only need to convert the weights of rank 0
+        saved_path = saved_dir / f"{saved_key}.bin"
+        val.tofile(saved_path.as_posix())
+
+    elif (
+            key.find("SelfAttention.o.weight") != -1
+            or key.find("EncDecAttention.o.weight") != -1
+            or key.find("DenseReluDense.wo.weight") != -1
+    ):
+        split_vals = np.split(val, factor, axis=0)
+        for j in range(factor):
+            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+
+    elif (
+            key.find("DenseReluDense.wi.weight") != -1
+            or (key.find("encoder") != -1 and (
+            key.find("SelfAttention.q.weight") != -1
+            or key.find("SelfAttention.k.weight") != -1
+            or key.find("SelfAttention.v.weight") != -1
+    )
+            )
+            or key.find("EncDecAttention.q.weight") != -1
+            or key.find("EncDecAttention.k.weight") != -1
+            or key.find("EncDecAttention.v.weight") != -1
+    ):
+        split_vals = np.split(val, factor, axis=-1)
+        for j in range(factor):
+            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif (
+            key.find("DenseReluDense.wi_0.weight") != -1
+            or key.find("DenseReluDense.wi_1.weight") != -1
+    ):
+        # For gated activation.
+        if key.find("DenseReluDense.wi_0.weight") != -1:
+            saved_key = key.replace("wi_0", "wi")
+        elif key.find("DenseReluDense.wi_1.weight") != -1:
+            saved_key = key.replace("wi_1", "wi2")
+        split_vals = np.split(val, factor, axis=-1)
+        for j in range(factor):
+            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("relative_attention_bias") != -1:
+        split_vals = np.split(val, factor, axis=0)
+        for j in range(factor):
+            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif (
+            key.find("decoder") != -1 and
+            (
+                    key.find("SelfAttention.q.weight") != -1
+                    or key.find("SelfAttention.k.weight") != -1
+                    or key.find("SelfAttention.v.weight") != -1
+            )
+    ):
+        pass
+    elif key.find("encoder.embed_tokens.weight") != -1 or \
+            key.find("decoder.embed_tokens.weight") != -1:
+        LOGGER.warning(f"Not save {key}, using shared.weight directly.")
+    else:
+        LOGGER.warning(f"cannot find key '{key}' with shape {val.shape}")
+
+
+def convert_checkpoint(args):
+    saved_dir = Path(args.saved_dir) / f"{args.inference_tensor_para_size:d}-gpu"
+    saved_dir.mkdir(parents=True, exist_ok=True)
+
+    if args.encoder_only:
+        t5_model = T5EncoderModel.from_pretrained(args.in_file)
+    else:
+        t5_model = T5ForConditionalGeneration.from_pretrained(args.in_file)
+
+    config = configparser.ConfigParser()
+
+    if t5_model.encoder.config.feed_forward_proj.find("gated") != -1:
+        new_configs["structure"]["use_gated_activation"] = "1"
+
+    config["encoder"] = {}
+    for key, val in t5_model.encoder.config.to_dict().items():
+        config["encoder"][key] = f"{val}"
+    config["encoder"]["weight_data_type"] = args.weight_data_type
+    config["decoder"] = {}
+    if not args.encoder_only:
+        for key, val in t5_model.decoder.config.to_dict().items():
+            config["decoder"][key] = f"{val}"
+        config["decoder"]["weight_data_type"] = args.weight_data_type
+
+    for key, val in rename_mapping.items():
+        config['encoder'][val] = config['encoder'].pop(key)
+        if not args.encoder_only:
+            config['decoder'][val] = config['decoder'].pop(key)
+    for key, val in new_configs.items():
+        config[key] = {}
+        for val_key, val_val in val.items():
+            config[key][val_key] = val_val
+    with open((saved_dir / f"config.ini").as_posix(), 'w') as configfile:
+        config.write(configfile)
+    np_weight_data_type = get_weight_data_type(args.weight_data_type)
+
+    i_gpu_num = args.inference_tensor_para_size
+
+    pool = multiprocessing.Pool(args.processes)
+    pool.starmap_async(split_and_convert_process,
+                       [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
+                        for name, param in t5_model.state_dict().items()])
+
+    pool.close()
+    pool.join()
+
+    if not args.encoder_only:
+        fuse_decoder_qkv(t5_model, i_gpu_num, saved_dir, np_weight_data_type)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("-saved_dir", "-o", type=str, help="file name of output file", required=True)
+    parser.add_argument("-in_file", "-i", type=str, help="file name of input checkpoint file", required=True)
+    parser.add_argument("-inference_tensor_para_size", "-i_g", type=int, help="How many gpus for inference",
+                        required=True)
+    parser.add_argument("-processes", "-p", type=int, help="How many processes to spawn for conversion (default: 4)",
+                        default=4)
+    parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])
+    parser.add_argument("--encoder_only", "-e", action="store_true")
+    parser.add_argument("--verbose", action="store_true", help="Provide verbose messages")
+    args = parser.parse_args()
+    log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO, format=log_format)
+    LOGGER.info("\n=============== Argument ===============")
+    for key in vars(args):
+        LOGGER.info(f"{key}: {vars(args)[key]}")
+    LOGGER.info("========================================")
+
+    start_time = datetime.now()
+    convert_checkpoint(args)
+    stop_time = datetime.now()
+    run_time = (stop_time - start_time)
+    LOGGER.info("Spend {} (h:m:s) to convert the model".format(run_time))

From 641e46dcf4942d0530d9c889e26a68e0932c21ed Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 20:49:05 -0700
Subject: [PATCH 020/262] commit

---
 examples/pytorch/bart/translate_example.py    |  4 ++--
 .../utils/huggingface_bart_ckpt_convert.py    | 19 ++++++-------------
 .../triton_backend/bart/BartTritonModel.cc    |  5 -----
 .../triton_backend/bart/BartTritonModel.h     |  5 +++--
 4 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/examples/pytorch/bart/translate_example.py b/examples/pytorch/bart/translate_example.py
index 3d32f9907..db2db77d5 100644
--- a/examples/pytorch/bart/translate_example.py
+++ b/examples/pytorch/bart/translate_example.py
@@ -213,7 +213,7 @@ def translate(args_dict):
                                      config.decoder_start_token_id, config.eos_token_id, config.vocab_size,
                                      tensor_para_size=tensor_para_size, pipeline_para_size=pipeline_para_size, 
                                      bart_with_bias=bart_with_bias, mbart=is_mbart,
-                                     position_embedding_type=position_embedding_type, 
+                                     position_embedding_type=position_embedding_type,
                                      activation_type=activation_type, layernorm_type=layernorm_type)
 
         ft_bart = FTBart(ft_encoder, ft_decoding)
@@ -375,4 +375,4 @@ def translate(args_dict):
     args = parser.parse_args()
     log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO, format=log_format)
-    translate(vars(args))
\ No newline at end of file
+    translate(vars(args))
diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 6763c940d..94fcc77f0 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -34,7 +34,7 @@
 
 rename_mapping = {"relative_attention_num_buckets": "relative_attention_num_buckets_or_max_pos_seq_len"}
 new_configs = {
-    "structure": {"t5_with_bias": "false", "use_gated_activation": "false", "position_embedding_type": "relative"}}
+    "structure": {"t5_with_bias": "false", "use_gated_activation": "false"}}
 
 
 def get_weight_data_type(data_type):
@@ -155,25 +155,18 @@ def convert_checkpoint(args):
     saved_dir = Path(args.saved_dir) / f"{args.inference_tensor_para_size:d}-gpu"
     saved_dir.mkdir(parents=True, exist_ok=True)
 
-    if args.encoder_only:
-        t5_model = T5EncoderModel.from_pretrained(args.in_file)
-    else:
-        t5_model = T5ForConditionalGeneration.from_pretrained(args.in_file)
+    bart_model = BartForConditionalGeneration.from_pretrained(args.in_file)
 
     config = configparser.ConfigParser()
 
-    if t5_model.encoder.config.feed_forward_proj.find("gated") != -1:
-        new_configs["structure"]["use_gated_activation"] = "1"
-
     config["encoder"] = {}
-    for key, val in t5_model.encoder.config.to_dict().items():
+    for key, val in bart_model.encoder.config.to_dict().items():
         config["encoder"][key] = f"{val}"
     config["encoder"]["weight_data_type"] = args.weight_data_type
     config["decoder"] = {}
-    if not args.encoder_only:
-        for key, val in t5_model.decoder.config.to_dict().items():
-            config["decoder"][key] = f"{val}"
-        config["decoder"]["weight_data_type"] = args.weight_data_type
+    for key, val in bart_model.decoder.config.to_dict().items():
+        config["decoder"][key] = f"{val}"
+    config["decoder"]["weight_data_type"] = args.weight_data_type
 
     for key, val in rename_mapping.items():
         config['encoder'][val] = config['encoder'].pop(key)
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 334d9eb5f..ee6e93ac8 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -76,9 +76,6 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     tensor_para_size_         = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     pipeline_para_size_       = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
     enable_custom_all_reduce_ = reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0);
-    bart_with_bias_           = reader.GetBoolean("structure", "bart_with_bias", true);
-    use_gated_activation_     = reader.GetBoolean("structure", "use_gated_activation", false);
-    q_scaling_    = bart_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
     max_distance_ = 128;  // use default value of huggingface here
 }
 
@@ -131,8 +128,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
     tie_word_embeddings_ = reader.GetBoolean("decoder", "tie_word_embeddings", true);
 
     // common settings
-    bart_with_bias_         = reader.GetBoolean("structure", "bart_with_bias", false);
-    use_gated_activation_ = reader.GetBoolean("structure", "use_gated_activation", false);
     activation_type_      = ft::getActivationType(reader.Get("encoder", "feed_forward_proj"));
     position_embedding_type_ =
         ft::PositionEmbeddingType(reader.Get("structure", "position_embedding_type", "relative") == "relative" ? 0 : 1);
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index 135f084f2..955cc2e0f 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -74,7 +74,7 @@ struct BartTritonModel: public AbstractTransformerModel {
     size_t                  decoding_vocab_size_;
     size_t                  decoding_num_bucket_or_max_pos_seq_len_;
 
-    float  q_scaling_;
+    float  q_scaling_ = 1.f;
 
     size_t max_distance_;
     int    start_id_;
@@ -91,8 +91,9 @@ struct BartTritonModel: public AbstractTransformerModel {
 
     // bart structure difference
     bool                      bart_with_bias_;
+    // TODO(zhwang): support mbart.
     bool                      mbart_para_ = false;
-    bool                      use_gated_activation_;
+    bool                      use_gated_activation_ = false;
     ft::PositionEmbeddingType position_embedding_type_ = ft::PositionEmbeddingType::absolute;
     ft::ActivationType        activation_type_;
     ft::LayerNormType         layernorm_type_ = ft::LayerNormType::post_layernorm;

From e25ee287af28b7a0c80156e08adea8d1f13e2b66 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 22:15:46 -0700
Subject: [PATCH 021/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 45 +++++++++----------
 .../triton_backend/bart/BartTritonModel.cc    | 27 +++++------
 .../triton_backend/bart/BartTritonModel.h     |  6 +--
 3 files changed, 34 insertions(+), 44 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 94fcc77f0..991172172 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -25,17 +25,13 @@
 dir_path = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(dir_path + "/../../../../3rdparty/transformers/src/")
 
-from transformers import BartForConditionalGeneration, BartEncoderModel
+from transformers import BartForConditionalGeneration
 
 import numpy as np
 import torch  # pytype: disable=import-error
 
 LOGGER = logging.getLogger(__name__)
 
-rename_mapping = {"relative_attention_num_buckets": "relative_attention_num_buckets_or_max_pos_seq_len"}
-new_configs = {
-    "structure": {"t5_with_bias": "false", "use_gated_activation": "false"}}
-
 
 def get_weight_data_type(data_type):
     if data_type == "fp32":
@@ -160,23 +156,24 @@ def convert_checkpoint(args):
     config = configparser.ConfigParser()
 
     config["encoder"] = {}
-    for key, val in bart_model.encoder.config.to_dict().items():
-        config["encoder"][key] = f"{val}"
-    config["encoder"]["weight_data_type"] = args.weight_data_type
+    config["encoder"]["num_heads"] = bart_model.config.encoder_attention_heads
+    config["encoder"]["d_kv"] = bart_model.config.d_model // bart_model.config.encoder_attention_heads
+    config["encoder"]["d_model"] = bart_model.config.d_model
+    config["encoder"]["d_ff"] = bart_model.config.encoder_ffn_dim
+    config["encoder"]["num_layers"] = bart_model.config.encoder_layers
+    config["encoder"]["vocab_size"] = bart_model.config.vocab_size
+    config["encoder"]["max_pos_seq_len"] = bart_model.config.max_position_embeddings
+
     config["decoder"] = {}
-    for key, val in bart_model.decoder.config.to_dict().items():
-        config["decoder"][key] = f"{val}"
-    config["decoder"]["weight_data_type"] = args.weight_data_type
-
-    for key, val in rename_mapping.items():
-        config['encoder'][val] = config['encoder'].pop(key)
-        if not args.encoder_only:
-            config['decoder'][val] = config['decoder'].pop(key)
-    for key, val in new_configs.items():
-        config[key] = {}
-        for val_key, val_val in val.items():
-            config[key][val_key] = val_val
-    with open((saved_dir / f"config.ini").as_posix(), 'w') as configfile:
+    config["encoder"]["num_heads"] = bart_model.config.decoder_attention_heads
+    config["encoder"]["d_kv"] = bart_model.config.d_model // bart_model.config.decoder_attention_heads
+    config["encoder"]["d_model"] = bart_model.config.d_model
+    config["encoder"]["d_ff"] = bart_model.config.decoder_ffn_dim
+    config["encoder"]["num_layers"] = bart_model.config.decoder_layers
+    config["encoder"]["vocab_size"] = bart_model.config.vocab_size
+    config["encoder"]["max_pos_seq_len"] = bart_model.config.max_position_embeddings
+
+    with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)
     np_weight_data_type = get_weight_data_type(args.weight_data_type)
 
@@ -185,13 +182,12 @@ def convert_checkpoint(args):
     pool = multiprocessing.Pool(args.processes)
     pool.starmap_async(split_and_convert_process,
                        [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
-                        for name, param in t5_model.state_dict().items()])
+                        for name, param in bart_model.state_dict().items()])
 
     pool.close()
     pool.join()
 
-    if not args.encoder_only:
-        fuse_decoder_qkv(t5_model, i_gpu_num, saved_dir, np_weight_data_type)
+    fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
 
 
 if __name__ == "__main__":
@@ -203,7 +199,6 @@ def convert_checkpoint(args):
     parser.add_argument("-processes", "-p", type=int, help="How many processes to spawn for conversion (default: 4)",
                         default=4)
     parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])
-    parser.add_argument("--encoder_only", "-e", action="store_true")
     parser.add_argument("--verbose", action="store_true", help="Provide verbose messages")
     args = parser.parse_args()
     log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index ee6e93ac8..dcd2a2c6c 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -58,8 +58,7 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     encoder_inter_size_    = reader.GetInteger("encoder", "d_ff");
     encoder_num_layer_     = reader.GetInteger("encoder", "num_layers");
     encoder_vocab_size_    = reader.GetInteger("encoder", "vocab_size");
-    encoder_num_bucket_or_max_pos_seq_len_ =
-        reader.GetInteger("encoder", "relative_attention_num_buckets_or_max_pos_seq_len");
+    encoder_max_pos_seq_len_ = reader.GetInteger("encoder", "max_pos_seq_len");
 
     // decoding
     decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
@@ -68,8 +67,7 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     decoding_inter_size_    = reader.GetInteger("decoder", "d_ff");
     decoding_num_layer_     = reader.GetInteger("decoder", "num_layers");
     decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
-    decoding_num_bucket_or_max_pos_seq_len_ =
-        reader.GetInteger("decoder", "relative_attention_num_buckets_or_max_pos_seq_len");
+    decoding_max_pos_seq_len_ = reader.GetInteger("decoder", "max_pos_seq_len");
 
     start_id_                 = reader.GetInteger("decoder", "decoder_start_token_id");
     end_id_                   = reader.GetInteger("decoder", "eos_token_id");
@@ -110,8 +108,8 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
     encoder_inter_size_    = reader.GetInteger("encoder", "d_ff");
     encoder_num_layer_     = reader.GetInteger("encoder", "num_layers");
     encoder_vocab_size_    = reader.GetInteger("encoder", "vocab_size");
-    encoder_num_bucket_or_max_pos_seq_len_ =
-        reader.GetInteger("encoder", "relative_attention_num_buckets_or_max_pos_seq_len");
+    encoder_max_pos_seq_len_ =
+        reader.GetInteger("encoder", "max_pos_seq_len");
 
     // decoding
     decoding_head_num_      = reader.GetInteger("decoder", "num_heads");
@@ -120,8 +118,8 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
     decoding_inter_size_    = reader.GetInteger("decoder", "d_ff");
     decoding_num_layer_     = reader.GetInteger("decoder", "num_layers");
     decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
-    decoding_num_bucket_or_max_pos_seq_len_ =
-        reader.GetInteger("decoder", "relative_attention_num_buckets_or_max_pos_seq_len");
+    decoding_max_pos_seq_len_ =
+        reader.GetInteger("decoder", "max_pos_seq_len");
 
     start_id_            = reader.GetInteger("decoder", "decoder_start_token_id");
     end_id_              = reader.GetInteger("decoder", "eos_token_id");
@@ -129,9 +127,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
 
     // common settings
     activation_type_      = ft::getActivationType(reader.Get("encoder", "feed_forward_proj"));
-    position_embedding_type_ =
-        ft::PositionEmbeddingType(reader.Get("structure", "position_embedding_type", "relative") == "relative" ? 0 : 1);
-    q_scaling_ = bart_with_bias_ ? 1.0f : (1.0f / (sqrt(encoder_size_per_head_) * 1.0f));
 
     max_distance_ = 128;  // use default value of huggingface here
 }
@@ -184,7 +179,7 @@ BartTritonModel<T>::createModelInstance(int
     // TODO(bhsueh) not support fused mha
     // NOTE: fmha doesn't support bart-style relative position bias
     ft::AttentionType attention_type =
-        ft::getAttentionType<T>(encoder_size_per_head_, sm_, true, encoder_num_bucket_or_max_pos_seq_len_, false);
+        ft::getAttentionType<T>(encoder_size_per_head_, sm_, true, encoder_max_pos_seq_len_, false);
 
     ft::NcclParam tensor_para_   = nccl_params.first[comms_rank];
     ft::NcclParam pipeline_para_ = nccl_params.second[comms_rank];
@@ -196,7 +191,7 @@ BartTritonModel<T>::createModelInstance(int
                                                                        encoder_inter_size_,
                                                                        encoder_d_model_,
                                                                        encoder_num_layer_,
-                                                                       encoder_num_bucket_or_max_pos_seq_len_,
+                                                                       encoder_max_pos_seq_len_,
                                                                        max_distance_,
                                                                        sm_,
                                                                        q_scaling_,
@@ -223,7 +218,7 @@ BartTritonModel<T>::createModelInstance(int
                                                                           decoding_d_model_,
                                                                           decoding_num_layer_,
                                                                           decoding_vocab_size_,
-                                                                          decoding_num_bucket_or_max_pos_seq_len_,
+                                                                          decoding_max_pos_seq_len_,
                                                                           max_distance_,
                                                                           q_scaling_,
                                                                           start_id_,
@@ -272,7 +267,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  encoder_inter_size_,
                                                  encoder_vocab_size_,
                                                  encoder_num_layer_,
-                                                 encoder_num_bucket_or_max_pos_seq_len_,
+                                                 encoder_max_pos_seq_len_,
                                                  tensor_para_size_,
                                                  tensor_para_rank,
                                                  pipeline_para_size_,
@@ -290,7 +285,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                   decoding_vocab_size_,
                                                   decoding_num_layer_,
                                                   encoder_d_model_,
-                                                  decoding_num_bucket_or_max_pos_seq_len_,
+                                                  decoding_max_pos_seq_len_,
                                                   tensor_para_size_,
                                                   tensor_para_rank,
                                                   pipeline_para_size_,
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index 955cc2e0f..b5b441835 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -63,7 +63,7 @@ struct BartTritonModel: public AbstractTransformerModel {
     size_t                  encoder_inter_size_;
     size_t                  encoder_num_layer_;
     size_t                  encoder_vocab_size_;
-    size_t                  encoder_num_bucket_or_max_pos_seq_len_;
+    size_t                  encoder_max_pos_seq_len_;
 
     // decoding
     size_t                  decoding_head_num_;
@@ -72,7 +72,7 @@ struct BartTritonModel: public AbstractTransformerModel {
     size_t                  decoding_inter_size_;
     size_t                  decoding_num_layer_;
     size_t                  decoding_vocab_size_;
-    size_t                  decoding_num_bucket_or_max_pos_seq_len_;
+    size_t                  decoding_max_pos_seq_len_;
 
     float  q_scaling_ = 1.f;
 
@@ -90,7 +90,7 @@ struct BartTritonModel: public AbstractTransformerModel {
     std::vector<std::shared_ptr<ft::BartDecodingWeight<T>>> decoding_shared_weights_;
 
     // bart structure difference
-    bool                      bart_with_bias_;
+    bool                      bart_with_bias_ = true;
     // TODO(zhwang): support mbart.
     bool                      mbart_para_ = false;
     bool                      use_gated_activation_ = false;

From bbca59f89af57620e1f5a7096ffa1d0f47103825 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 22:20:14 -0700
Subject: [PATCH 022/262] commit

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 5 ++---
 src/fastertransformer/triton_backend/bart/BartTritonModel.h  | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index dcd2a2c6c..2082a5e62 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -123,7 +123,6 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
 
     start_id_            = reader.GetInteger("decoder", "decoder_start_token_id");
     end_id_              = reader.GetInteger("decoder", "eos_token_id");
-    tie_word_embeddings_ = reader.GetBoolean("decoder", "tie_word_embeddings", true);
 
     // common settings
     activation_type_      = ft::getActivationType(reader.Get("encoder", "feed_forward_proj"));
@@ -310,12 +309,12 @@ std::string BartTritonModel<T>::toString()
        << "\n    encoder_head_num_: " << encoder_head_num_ << "\n    encoder_size_per_head_: " << encoder_size_per_head_
        << "\n    encoder_d_model_: " << encoder_d_model_ << "\n    encoder_inter_size_: " << encoder_inter_size_
        << "\n    encoder_num_layer_: " << encoder_num_layer_ << "\n    encoder_vocab_size_: " << encoder_vocab_size_
-       << "\n    encoder_num_bucket_or_max_pos_seq_len_: " << encoder_num_bucket_or_max_pos_seq_len_
+       << "\n    encoder_max_pos_seq_len_: " << encoder_max_pos_seq_len_
        << "\n    decoding_head_num_: " << decoding_head_num_
        << "\n    decoding_size_per_head_: " << decoding_size_per_head_
        << "\n    decoding_d_model_: " << decoding_d_model_ << "\n    decoding_inter_size_: " << decoding_inter_size_
        << "\n    decoding_num_layer_: " << decoding_num_layer_ << "\n    decoding_vocab_size_: " << decoding_vocab_size_
-       << "\n    decoding_num_bucket_or_max_pos_seq_len_: " << decoding_num_bucket_or_max_pos_seq_len_
+       << "\n    decoding_max_pos_seq_len_: " << decoding_max_pos_seq_len_
        << "\n    bart_with_bias_: " << bart_with_bias_
        << "\n    use_gated_activation_: " << use_gated_activation_
        << "\n   position_embedding_type_: " << position_embedding_type_string << "\n    start_id_: " << start_id_
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.h b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
index b5b441835..47ab7f08f 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.h
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.h
@@ -80,7 +80,7 @@ struct BartTritonModel: public AbstractTransformerModel {
     int    start_id_;
     int    end_id_;
 
-    bool tie_word_embeddings_;
+    bool tie_word_embeddings_ = false;
 
     size_t tensor_para_size_;
     size_t pipeline_para_size_;

From f7f3f2e19d4409c80aea6de8165ee56934e4d788 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 22:24:26 -0700
Subject: [PATCH 023/262] commit

---
 .../models/bart/BartEncoderWeight.cc          | 52 ++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index 47028260a..9f58bd250 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -249,7 +249,57 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
 {
     FT_LOG_DEBUG("BartEncoderWeight " + std::string(__func__) + " start");
 
-    FT_LOG_DEBUG("Megatron BART support TBD");
+    FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "encoder");
+    FT_CHECK(is_maintain_buffer == true);
+
+    loadWeightFromBin<T>(
+        weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/encoder.final_layer_norm.weight.bin", model_file_type);
+    if (position_embedding_type == PositionEmbeddingType::absolute) {
+        loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/shared.ape.bin", model_file_type);
+    }
+    else {
+        loadWeightFromBin<T>(weights_ptr[1],
+                             {(size_t)weights_size[1]},
+                             dir_path + "/encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight."
+                                 + std::to_string(tensor_para_rank_) + ".bin",
+                             model_file_type);
+    }
+    loadWeightFromBin<T>(weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/shared.weight_T.bin", model_file_type);
+    if (bart_with_bias) {
+        loadWeightFromBin<T>(weights_ptr[3],
+                             {(size_t)weights_size[3]},
+                             dir_path + "/encoder.final_layer_norm.bias.bin",
+                             model_file_type);
+    }
+
+    // prompt table: load weights from bin
+    if (malloc_load_prompt_weights_) {
+        for (auto const& prompt : prompt_learning_pair_) {
+            std::string task_name      = prompt.first;
+            int         task_name_id   = prompt.second.first;
+            int         prompt_length  = prompt.second.second;
+            size_t      task_weight_id = weights_num_ + (size_t)task_name_id;
+
+            std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ?
+                                                      (dir_path + "/model.prompt_table." + task_name + ".weight.bin") :
+                                                      (dir_path + "/model.prefix_prompt." + task_name + ".weight."
+                                                       + std::to_string(tensor_para_rank_) + ".bin");
+            FT_LOG_DEBUG("load prompt_weight_path_name: %s", prompt_weight_path_name.c_str());
+            if (prompt_length > 0) {
+                loadWeightFromBin<T>(weights_ptr[task_weight_id],
+                                     {prompt_length * prompt_token_weight_size_},
+                                     prompt_weight_path_name,
+                                     model_file_type);
+            }
+        }
+    }
+
+    for (int l = 0; l < num_layer_; l++) {
+        if (isValidLayerParallelId(l)) {
+            t5_encoder_layer_weights[l]->loadModel(dir_path + "/encoder.block." + std::to_string(l) + ".",
+                                                   model_file_type);
+        }
+    }
 
     FT_LOG_DEBUG("BartEncoderWeight " + std::string(__func__) + " end");
 }

From 1be58b7fd53810adea07482199f70bf729a62359 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 22:24:46 -0700
Subject: [PATCH 024/262] commit

---
 .../models/bart/BartEncoderWeight.cc          | 22 -------------------
 1 file changed, 22 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index 9f58bd250..876aa1e38 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -272,28 +272,6 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
                              model_file_type);
     }
 
-    // prompt table: load weights from bin
-    if (malloc_load_prompt_weights_) {
-        for (auto const& prompt : prompt_learning_pair_) {
-            std::string task_name      = prompt.first;
-            int         task_name_id   = prompt.second.first;
-            int         prompt_length  = prompt.second.second;
-            size_t      task_weight_id = weights_num_ + (size_t)task_name_id;
-
-            std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ?
-                                                      (dir_path + "/model.prompt_table." + task_name + ".weight.bin") :
-                                                      (dir_path + "/model.prefix_prompt." + task_name + ".weight."
-                                                       + std::to_string(tensor_para_rank_) + ".bin");
-            FT_LOG_DEBUG("load prompt_weight_path_name: %s", prompt_weight_path_name.c_str());
-            if (prompt_length > 0) {
-                loadWeightFromBin<T>(weights_ptr[task_weight_id],
-                                     {prompt_length * prompt_token_weight_size_},
-                                     prompt_weight_path_name,
-                                     model_file_type);
-            }
-        }
-    }
-
     for (int l = 0; l < num_layer_; l++) {
         if (isValidLayerParallelId(l)) {
             t5_encoder_layer_weights[l]->loadModel(dir_path + "/encoder.block." + std::to_string(l) + ".",

From 3011aecf50f75026f4df92c8d5277da0d1a33d8b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 22:30:47 -0700
Subject: [PATCH 025/262] commit

---
 src/fastertransformer/models/bart/BartEncoderWeight.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index 876aa1e38..412af6863 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -274,7 +274,7 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
 
     for (int l = 0; l < num_layer_; l++) {
         if (isValidLayerParallelId(l)) {
-            t5_encoder_layer_weights[l]->loadModel(dir_path + "/encoder.block." + std::to_string(l) + ".",
+            bart_encoder_layer_weights[l]->loadModel(dir_path + "/encoder.block." + std::to_string(l) + ".",
                                                    model_file_type);
         }
     }

From 1b421593bbba4551bfe8b13cb963bd4cb46a94b4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 22:56:48 -0700
Subject: [PATCH 026/262] commit

---
 .../models/bart/BartEncoderWeight.cc          | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index 412af6863..c041d7ba2 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -252,19 +252,15 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
     FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "encoder");
     FT_CHECK(is_maintain_buffer == true);
 
+    /*
+        // 6: [0] absolute/relative positional embedding weight [1] word embedding weight [2] pre-LN weight [3] post-LN
+    // weight [4] pre-LN bias[5] post-LN bias. Assuming both mBART and bias
+    */
+
+    loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/shared.ape.bin", model_file_type);
+    loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/shared.weight_T.bin", model_file_type);
     loadWeightFromBin<T>(
-        weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/encoder.final_layer_norm.weight.bin", model_file_type);
-    if (position_embedding_type == PositionEmbeddingType::absolute) {
-        loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/shared.ape.bin", model_file_type);
-    }
-    else {
-        loadWeightFromBin<T>(weights_ptr[1],
-                             {(size_t)weights_size[1]},
-                             dir_path + "/encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight."
-                                 + std::to_string(tensor_para_rank_) + ".bin",
-                             model_file_type);
-    }
-    loadWeightFromBin<T>(weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/shared.weight_T.bin", model_file_type);
+        weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/encoder.final_layer_norm.weight.bin", model_file_type);
     if (bart_with_bias) {
         loadWeightFromBin<T>(weights_ptr[3],
                              {(size_t)weights_size[3]},

From 53185a563a36d0bb190f2239f424ace209cd98c1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 17 Sep 2023 23:18:34 -0700
Subject: [PATCH 027/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 28 ++++++++++---------
 .../models/bart/BartEncoderWeight.cc          |  4 +--
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 991172172..f972bf1e4 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -66,26 +66,28 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
 def split_and_convert_process(key, val, factor, saved_dir):
     if val.ndim == 2:
         val = val.transpose(1, 0)
-    saved_key = key
     LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
 
-    if key.find("shared.weight") != -1:
+    if key.find("encoder.embed_positions.weight") != -1:
         # shared weights, only need to convert the weights of rank 0
-        saved_path = saved_dir / f"{saved_key}.bin"
+        saved_path = saved_dir / "encoder.embed_positions.weight.bin"
+        val[2:, :].tofile(saved_path.as_posix())
+    if key.find("encoder.embed_tokens.weight") != -1:
+        # shared weights, only need to convert the weights of rank 0
+        saved_path = saved_dir / "encoder.embed_tokens.weight.bin"
         val.tofile(saved_path.as_posix())
-
-        saved_path = saved_dir / f"{saved_key}_T.bin"
-        val.T.tofile(saved_path.as_posix())
-    elif key.find("lm_head.weight") != -1:
-        # lm_head weights, only need to convert the weights of rank 0
-        val = val.transpose(1, 0)  # For lm_head, we use TN gemm to compute, so we don't need to transpose
-        saved_path = saved_dir / f"{saved_key}.bin"
+    elif key.find("encoder.layernorm_embedding.weight") != -1:
+        # shared weights, only need to convert the weights of rank 0
+        saved_path = saved_dir / "encoder.final_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
-
-    elif key.find("layer_norm.weight") != -1:
+    elif key.find("encoder.layernorm_embedding.bias") != -1:
         # shared weights, only need to convert the weights of rank 0
-        saved_path = saved_dir / f"{saved_key}.bin"
+        saved_path = saved_dir / "encoder.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
+    elif key.find("encoder.embed_positions.weight") != -1:
+        # shared weights, only need to convert the weights of rank 0
+        saved_path = saved_dir / "encoder.shared.ape.bin"
+        val[2:, :].tofile(saved_path.as_posix())
 
     elif (
             key.find("SelfAttention.o.weight") != -1
diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index c041d7ba2..94d97d1ab 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -257,8 +257,8 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
     // weight [4] pre-LN bias[5] post-LN bias. Assuming both mBART and bias
     */
 
-    loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/shared.ape.bin", model_file_type);
-    loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/shared.weight_T.bin", model_file_type);
+    loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/encoder.embed_positions.weight.bin", model_file_type);
+    loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/encoder.embed_tokens.weight.bin", model_file_type);
     loadWeightFromBin<T>(
         weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/encoder.final_layer_norm.weight.bin", model_file_type);
     if (bart_with_bias) {

From 411892b0fc4db56e6991458cac0871f40b7ca56c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 20:24:56 -0700
Subject: [PATCH 028/262] commit

---
 .../pytorch/bart/utils/huggingface_bart_ckpt_convert.py     | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index f972bf1e4..b6c0efa29 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -69,11 +69,9 @@ def split_and_convert_process(key, val, factor, saved_dir):
     LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
 
     if key.find("encoder.embed_positions.weight") != -1:
-        # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / "encoder.embed_positions.weight.bin"
         val[2:, :].tofile(saved_path.as_posix())
     if key.find("encoder.embed_tokens.weight") != -1:
-        # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / "encoder.embed_tokens.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder.layernorm_embedding.weight") != -1:
@@ -84,10 +82,6 @@ def split_and_convert_process(key, val, factor, saved_dir):
         # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / "encoder.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("encoder.embed_positions.weight") != -1:
-        # shared weights, only need to convert the weights of rank 0
-        saved_path = saved_dir / "encoder.shared.ape.bin"
-        val[2:, :].tofile(saved_path.as_posix())
 
     elif (
             key.find("SelfAttention.o.weight") != -1

From 56bb47c9b3168175878c5f30e6e5c670981ee9b4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 20:25:14 -0700
Subject: [PATCH 029/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index b6c0efa29..4e86597ce 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -75,11 +75,9 @@ def split_and_convert_process(key, val, factor, saved_dir):
         saved_path = saved_dir / "encoder.embed_tokens.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder.layernorm_embedding.weight") != -1:
-        # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / "encoder.final_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder.layernorm_embedding.bias") != -1:
-        # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / "encoder.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
 

From 30f164292abf9c30c46c1110bc7f6192a88d922c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:12:34 -0700
Subject: [PATCH 030/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 112 ++++++++++--------
 1 file changed, 63 insertions(+), 49 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 4e86597ce..0e37c292d 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -80,60 +80,74 @@ def split_and_convert_process(key, val, factor, saved_dir):
     elif key.find("encoder.layernorm_embedding.bias") != -1:
         saved_path = saved_dir / "encoder.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
-
     elif (
-            key.find("SelfAttention.o.weight") != -1
-            or key.find("EncDecAttention.o.weight") != -1
-            or key.find("DenseReluDense.wo.weight") != -1
+        key.find("self_attn.k_proj.weight") != -1
+        or key.find("self_attn.v_proj.weight") != -1
+        or key.find("self_attn.q_proj.weight") != -1
     ):
         split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        qkv = key.split('self_attn.')[:1]
         for j in range(factor):
-            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-            split_vals[j].tofile(saved_path.as_posix())
-
-    elif (
-            key.find("DenseReluDense.wi.weight") != -1
-            or (key.find("encoder") != -1 and (
-            key.find("SelfAttention.q.weight") != -1
-            or key.find("SelfAttention.k.weight") != -1
-            or key.find("SelfAttention.v.weight") != -1
-    )
-            )
-            or key.find("EncDecAttention.q.weight") != -1
-            or key.find("EncDecAttention.k.weight") != -1
-            or key.find("EncDecAttention.v.weight") != -1
-    ):
-        split_vals = np.split(val, factor, axis=-1)
-        for j in range(factor):
-            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.weight{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
-    elif (
-            key.find("DenseReluDense.wi_0.weight") != -1
-            or key.find("DenseReluDense.wi_1.weight") != -1
-    ):
-        # For gated activation.
-        if key.find("DenseReluDense.wi_0.weight") != -1:
-            saved_key = key.replace("wi_0", "wi")
-        elif key.find("DenseReluDense.wi_1.weight") != -1:
-            saved_key = key.replace("wi_1", "wi2")
-        split_vals = np.split(val, factor, axis=-1)
-        for j in range(factor):
-            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-            split_vals[j].tofile(saved_path.as_posix())
-    elif key.find("relative_attention_bias") != -1:
-        split_vals = np.split(val, factor, axis=0)
-        for j in range(factor):
-            saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-            split_vals[j].tofile(saved_path.as_posix())
-    elif (
-            key.find("decoder") != -1 and
-            (
-                    key.find("SelfAttention.q.weight") != -1
-                    or key.find("SelfAttention.k.weight") != -1
-                    or key.find("SelfAttention.v.weight") != -1
-            )
-    ):
-        pass
+    # elif (
+    #         key.find("SelfAttention.o.weight") != -1
+    #         or key.find("EncDecAttention.o.weight") != -1
+    #         or key.find("DenseReluDense.wo.weight") != -1
+    # ):
+    #     split_vals = np.split(val, factor, axis=0)
+    #     for j in range(factor):
+    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+    #         split_vals[j].tofile(saved_path.as_posix())
+
+    # elif (
+    #         key.find("DenseReluDense.wi.weight") != -1
+    #         or (key.find("encoder") != -1 and (
+    #         key.find("SelfAttention.q.weight") != -1
+    #         or key.find("SelfAttention.k.weight") != -1
+    #         or key.find("SelfAttention.v.weight") != -1
+    # )
+    #         )
+    #         or key.find("EncDecAttention.q.weight") != -1
+    #         or key.find("EncDecAttention.k.weight") != -1
+    #         or key.find("EncDecAttention.v.weight") != -1
+    # ):
+    #     split_vals = np.split(val, factor, axis=-1)
+    #     for j in range(factor):
+    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+    #         split_vals[j].tofile(saved_path.as_posix())
+    # elif (
+    #         key.find("DenseReluDense.wi_0.weight") != -1
+    #         or key.find("DenseReluDense.wi_1.weight") != -1
+    # ):
+    #     # For gated activation.
+    #     if key.find("DenseReluDense.wi_0.weight") != -1:
+    #         saved_key = key.replace("wi_0", "wi")
+    #     elif key.find("DenseReluDense.wi_1.weight") != -1:
+    #         saved_key = key.replace("wi_1", "wi2")
+    #     split_vals = np.split(val, factor, axis=-1)
+    #     for j in range(factor):
+    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+    #         split_vals[j].tofile(saved_path.as_posix())
+    # elif key.find("relative_attention_bias") != -1:
+    #     split_vals = np.split(val, factor, axis=0)
+    #     for j in range(factor):
+    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
+    #         split_vals[j].tofile(saved_path.as_posix())
+    # elif (
+    #         key.find("decoder") != -1 and
+    #         (
+    #                 key.find("SelfAttention.q.weight") != -1
+    #                 or key.find("SelfAttention.k.weight") != -1
+    #                 or key.find("SelfAttention.v.weight") != -1
+    #         )
+    # ):
+    #     pass
     elif key.find("encoder.embed_tokens.weight") != -1 or \
             key.find("decoder.embed_tokens.weight") != -1:
         LOGGER.warning(f"Not save {key}, using shared.weight directly.")

From 1cc0eaf072c73c9934fe7a207d7d4cd46e577393 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:16:23 -0700
Subject: [PATCH 031/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 0e37c292d..e23b3de2d 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -160,26 +160,26 @@ def convert_checkpoint(args):
     saved_dir.mkdir(parents=True, exist_ok=True)
 
     bart_model = BartForConditionalGeneration.from_pretrained(args.in_file)
-
+    hf_config = vars(bart_model.config)
     config = configparser.ConfigParser()
 
     config["encoder"] = {}
-    config["encoder"]["num_heads"] = bart_model.config.encoder_attention_heads
-    config["encoder"]["d_kv"] = bart_model.config.d_model // bart_model.config.encoder_attention_heads
-    config["encoder"]["d_model"] = bart_model.config.d_model
-    config["encoder"]["d_ff"] = bart_model.config.encoder_ffn_dim
-    config["encoder"]["num_layers"] = bart_model.config.encoder_layers
-    config["encoder"]["vocab_size"] = bart_model.config.vocab_size
-    config["encoder"]["max_pos_seq_len"] = bart_model.config.max_position_embeddings
+    config["encoder"]["num_heads"] = hf_config["encoder_attention_heads"]
+    config["encoder"]["d_kv"] = hf_config["d_model"] // hf_config["encoder_attention_heads"]
+    config["encoder"]["d_model"] = hf_config["d_model"]
+    config["encoder"]["d_ff"] = hf_config["encoder_ffn_dim"]
+    config["encoder"]["num_layers"] = hf_config["encoder_layers"]
+    config["encoder"]["vocab_size"] = hf_config["vocab_size"]
+    config["encoder"]["max_pos_seq_len"] = hf_config["max_position_embeddings"]
 
     config["decoder"] = {}
-    config["encoder"]["num_heads"] = bart_model.config.decoder_attention_heads
-    config["encoder"]["d_kv"] = bart_model.config.d_model // bart_model.config.decoder_attention_heads
-    config["encoder"]["d_model"] = bart_model.config.d_model
-    config["encoder"]["d_ff"] = bart_model.config.decoder_ffn_dim
-    config["encoder"]["num_layers"] = bart_model.config.decoder_layers
-    config["encoder"]["vocab_size"] = bart_model.config.vocab_size
-    config["encoder"]["max_pos_seq_len"] = bart_model.config.max_position_embeddings
+    config["encoder"]["num_heads"] = hf_config["decoder_attention_heads"]
+    config["encoder"]["d_kv"] = hf_config["d_model"] // hf_config["decoder_attention_heads"]
+    config["encoder"]["d_model"] = hf_config["d_model"]
+    config["encoder"]["d_ff"] = hf_config["decoder_ffn_dim"]
+    config["encoder"]["num_layers"] = hf_config["decoder_layers"]
+    config["encoder"]["vocab_size"] = hf_config["vocab_size"]
+    config["encoder"]["max_pos_seq_len"] = hf_config["max_position_embeddings"]
 
     with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)

From ef97a9e3224909dd8cda31645a90ab0cc1b7cd7f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:17:23 -0700
Subject: [PATCH 032/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index e23b3de2d..4a9d5c1e4 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -164,7 +164,7 @@ def convert_checkpoint(args):
     config = configparser.ConfigParser()
 
     config["encoder"] = {}
-    config["encoder"]["num_heads"] = hf_config["encoder_attention_heads"]
+    config["encoder"]["num_heads"] = str(hf_config["encoder_attention_heads"])
     config["encoder"]["d_kv"] = hf_config["d_model"] // hf_config["encoder_attention_heads"]
     config["encoder"]["d_model"] = hf_config["d_model"]
     config["encoder"]["d_ff"] = hf_config["encoder_ffn_dim"]
@@ -173,7 +173,7 @@ def convert_checkpoint(args):
     config["encoder"]["max_pos_seq_len"] = hf_config["max_position_embeddings"]
 
     config["decoder"] = {}
-    config["encoder"]["num_heads"] = hf_config["decoder_attention_heads"]
+    config["encoder"]["num_heads"] = str(hf_config["decoder_attention_heads"])
     config["encoder"]["d_kv"] = hf_config["d_model"] // hf_config["decoder_attention_heads"]
     config["encoder"]["d_model"] = hf_config["d_model"]
     config["encoder"]["d_ff"] = hf_config["decoder_ffn_dim"]

From 440de12903bf885bedf7286e3cc02cc3a944bc7c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:18:25 -0700
Subject: [PATCH 033/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 4a9d5c1e4..a47b6b1fa 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -165,21 +165,21 @@ def convert_checkpoint(args):
 
     config["encoder"] = {}
     config["encoder"]["num_heads"] = str(hf_config["encoder_attention_heads"])
-    config["encoder"]["d_kv"] = hf_config["d_model"] // hf_config["encoder_attention_heads"]
-    config["encoder"]["d_model"] = hf_config["d_model"]
-    config["encoder"]["d_ff"] = hf_config["encoder_ffn_dim"]
-    config["encoder"]["num_layers"] = hf_config["encoder_layers"]
-    config["encoder"]["vocab_size"] = hf_config["vocab_size"]
-    config["encoder"]["max_pos_seq_len"] = hf_config["max_position_embeddings"]
+    config["encoder"]["d_kv"] = str(hf_config["d_model"] // hf_config["encoder_attention_heads"])
+    config["encoder"]["d_model"] = str(hf_config["d_model"])
+    config["encoder"]["d_ff"] = str(hf_config["encoder_ffn_dim"])
+    config["encoder"]["num_layers"] = str(hf_config["encoder_layers"])
+    config["encoder"]["vocab_size"] = str(hf_config["vocab_size"])
+    config["encoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
 
     config["decoder"] = {}
     config["encoder"]["num_heads"] = str(hf_config["decoder_attention_heads"])
-    config["encoder"]["d_kv"] = hf_config["d_model"] // hf_config["decoder_attention_heads"]
-    config["encoder"]["d_model"] = hf_config["d_model"]
-    config["encoder"]["d_ff"] = hf_config["decoder_ffn_dim"]
-    config["encoder"]["num_layers"] = hf_config["decoder_layers"]
-    config["encoder"]["vocab_size"] = hf_config["vocab_size"]
-    config["encoder"]["max_pos_seq_len"] = hf_config["max_position_embeddings"]
+    config["encoder"]["d_kv"] = str(hf_config["d_model"] // hf_config["decoder_attention_heads"])
+    config["encoder"]["d_model"] = str(hf_config["d_model"])
+    config["encoder"]["d_ff"] = str(hf_config["decoder_ffn_dim"])
+    config["encoder"]["num_layers"] = str(hf_config["decoder_layers"])
+    config["encoder"]["vocab_size"] = str(hf_config["vocab_size"])
+    config["encoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
 
     with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)

From 9d1448297cbfdff5d98793a0650a2e7881a6ce82 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:19:25 -0700
Subject: [PATCH 034/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py    | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index a47b6b1fa..f7afb2ec0 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -173,13 +173,13 @@ def convert_checkpoint(args):
     config["encoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
 
     config["decoder"] = {}
-    config["encoder"]["num_heads"] = str(hf_config["decoder_attention_heads"])
-    config["encoder"]["d_kv"] = str(hf_config["d_model"] // hf_config["decoder_attention_heads"])
-    config["encoder"]["d_model"] = str(hf_config["d_model"])
-    config["encoder"]["d_ff"] = str(hf_config["decoder_ffn_dim"])
-    config["encoder"]["num_layers"] = str(hf_config["decoder_layers"])
-    config["encoder"]["vocab_size"] = str(hf_config["vocab_size"])
-    config["encoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
+    config["decoder"]["num_heads"] = str(hf_config["decoder_attention_heads"])
+    config["decoder"]["d_kv"] = str(hf_config["d_model"] // hf_config["decoder_attention_heads"])
+    config["decoder"]["d_model"] = str(hf_config["d_model"])
+    config["decoder"]["d_ff"] = str(hf_config["decoder_ffn_dim"])
+    config["decoder"]["num_layers"] = str(hf_config["decoder_layers"])
+    config["decoder"]["vocab_size"] = str(hf_config["vocab_size"])
+    config["decoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
 
     with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)

From f416d67bced68bb6dd368ab09697f2f0d7e59304 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:21:45 -0700
Subject: [PATCH 035/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index f7afb2ec0..e4ac0c688 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -91,7 +91,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         else:
             prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
-        qkv = key.split('self_attn.')[:1]
+        qkv = key.split('self_attn.')[1][:1]
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.weight{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())

From 6b775c36735e9ad622e477a40b101caff202c1c6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:24:12 -0700
Subject: [PATCH 036/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index e4ac0c688..ded82e7d4 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -93,7 +93,22 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         qkv = key.split('self_attn.')[1][:1]
         for j in range(factor):
-            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.weight{j:d}.bin"
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.weight.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif (
+        key.find("self_attn.k_proj.bias") != -1
+        or key.find("self_attn.v_proj.bias") != -1
+        or key.find("self_attn.q_proj.bias") != -1
+    ):
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        qkv = key.split('self_attn.')[1][:1]
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     # elif (
     #         key.find("SelfAttention.o.weight") != -1

From e665612f1582e47c67eb1e3fbbc86ae9665fcb50 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:45:01 -0700
Subject: [PATCH 037/262] commit

---
 .vscode/settings.json                         | 20 +++++-
 .../models/bart/BartEncoderLayerWeight.cc     | 72 ++++++++++++++++++-
 .../models/bart/BartEncoderWeight.cc          |  2 +-
 3 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6f535da99..efbb5fbf3 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -67,6 +67,22 @@
         "unordered_set": "cpp",
         "future": "cpp",
         "cfenv": "cpp",
-        "typeindex": "cpp"
+        "typeindex": "cpp",
+        "__bit_reference": "cpp",
+        "__config": "cpp",
+        "__debug": "cpp",
+        "__errc": "cpp",
+        "__hash_table": "cpp",
+        "__locale": "cpp",
+        "__mutex_base": "cpp",
+        "__node_handle": "cpp",
+        "__split_buffer": "cpp",
+        "__threading_support": "cpp",
+        "__tree": "cpp",
+        "__verbose_abort": "cpp",
+        "charconv": "cpp",
+        "ios": "cpp",
+        "locale": "cpp",
+        "variant": "cpp"
     }
-}
\ No newline at end of file
+}
diff --git a/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc b/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
index 579e8aec3..4dff65721 100644
--- a/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
@@ -293,7 +293,77 @@ void BartEncoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 {
     FT_LOG_DEBUG("BartEncoderLayerWeight " + std::string(__func__) + " start");
 
-    FT_LOG_DEBUG("Megatron BART support TBD");
+    const auto tp_rank = std::to_string(tensor_para_rank_);
+    loadWeightFromBin<T>(weights_ptr_[0],
+                         {weights_size_[0]},
+                         dir_path + "layer.SelfAttention.q.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[1],
+                         {weights_size_[1]},
+                         dir_path + "layer.SelfAttention.k.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[2],
+                         {weights_size_[2]},
+                         dir_path + "layer.SelfAttention.v.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[3],
+                         {weights_size_[3]},
+                         dir_path + "layer.SelfAttention.out_proj.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[4],
+                         {weights_size_[4]},
+                         dir_path + "layer.SelfAttention.attn_layer_norm.weight.bin",
+                         model_file_type);
+
+    loadWeightFromBin<T>(weights_ptr_[5],
+                        {weights_size_[5]},
+                        dir_path + "layer.SelfAttention.fc1.weight.bin",
+                        model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[6],
+                        {weights_size_[6]},
+                        dir_path + "layer.SelfAttention.fc2.weight.bin",
+                        model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[7],
+                        {weights_size_[7]},
+                        dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
+                        model_file_type);
+
+    if (bart_with_bias_) {
+        loadWeightFromBin<T>(weights_ptr_[8],
+                            {weights_size_[8]},
+                            dir_path + "layer.SelfAttention.q.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[9],
+                            {weights_size_[9]},
+                            dir_path + "layer.SelfAttention.k.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[10],
+                            {weights_size_[10]},
+                            dir_path + "layer.SelfAttention.v.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[11],
+                            {weights_size_[11]},
+                            dir_path + "layer.SelfAttention.out_proj.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[12],
+                            {weights_size_[12]},
+                            dir_path + "layer.SelfAttention.attn_layer_norm.bias.bin",
+                            model_file_type);
+
+        loadWeightFromBin<T>(weights_ptr_[13],
+                            {weights_size_[13]},
+                            dir_path + "layer.SelfAttention.fc1.bias.bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[14],
+                            {weights_size_[14]},
+                            dir_path + "layer.SelfAttention.fc2.bias.bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[15],
+                            {weights_size_[15]},
+                            dir_path + "layer.SelfAttention.final_layer_norm.bias.bin",
+                            model_file_type);       
+    }
+
 
     FT_LOG_DEBUG("BartEncoderLayerWeight " + std::string(__func__) + " end");
 }
diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index 94d97d1ab..b20fdeeef 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -270,7 +270,7 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
 
     for (int l = 0; l < num_layer_; l++) {
         if (isValidLayerParallelId(l)) {
-            bart_encoder_layer_weights[l]->loadModel(dir_path + "/encoder.block." + std::to_string(l) + ".",
+            bart_encoder_layer_weights[l]->loadModel(dir_path + "/encoder." + std::to_string(l) + ".",
                                                    model_file_type);
         }
     }

From 2490c02da4f33606eac7dbf0c0ac58291ca15c27 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:52:27 -0700
Subject: [PATCH 038/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index ded82e7d4..593f186ff 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -110,6 +110,42 @@ def split_and_convert_process(key, val, factor, saved_dir):
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("self_attn.out_proj.weight") != -1:
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.weight.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("self_attn.out_proj.bias") != -1:
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.bias.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("self_attn_layer_norm.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.weight.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("self_attn_layer_norm.bias") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
+        val.tofile(saved_path.as_posix())
     # elif (
     #         key.find("SelfAttention.o.weight") != -1
     #         or key.find("EncDecAttention.o.weight") != -1

From 6d126c249afd037256fb2ff4bcd3975434a27534 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 21:57:09 -0700
Subject: [PATCH 039/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 593f186ff..0e8a8e33a 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -246,7 +246,7 @@ def convert_checkpoint(args):
     pool.close()
     pool.join()
 
-    fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
+    # fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
 
 
 if __name__ == "__main__":

From bff97b1c18f8f6be1ad7e55f025061352d2813bb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:00:45 -0700
Subject: [PATCH 040/262] Use elif for encoder.embed_tokens branch and reword unmatched-key warning

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 0e8a8e33a..6fa611e49 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -71,7 +71,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
     if key.find("encoder.embed_positions.weight") != -1:
         saved_path = saved_dir / "encoder.embed_positions.weight.bin"
         val[2:, :].tofile(saved_path.as_posix())
-    if key.find("encoder.embed_tokens.weight") != -1:
+    elif key.find("encoder.embed_tokens.weight") != -1:
         saved_path = saved_dir / "encoder.embed_tokens.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder.layernorm_embedding.weight") != -1:
@@ -203,7 +203,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             key.find("decoder.embed_tokens.weight") != -1:
         LOGGER.warning(f"Not save {key}, using shared.weight directly.")
     else:
-        LOGGER.warning(f"cannot find key '{key}' with shape {val.shape}")
+        LOGGER.warning(f"Not save '{key}' with shape {val.shape}")
 
 
 def convert_checkpoint(args):

From a089a2f920bdb6e68e32af9d861b8febddee3a25 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:03:28 -0700
Subject: [PATCH 041/262] Add fc1/fc2 export branches to BART checkpoint converter

---
 .../utils/huggingface_bart_ckpt_convert.py    | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 6fa611e49..d29b23675 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -146,6 +146,54 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
+    elif key.find("fc1.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.fc1.weight')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc1.weight.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("fc1.bias") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.fc1.bias')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc1.bias.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("fc2.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.fc2.weight')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.weight.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("fc1.bias") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.fc2.bias')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.bias.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("self_attn_layer_norm.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.weight.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("self_attn_layer_norm.bias") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.self_attn')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
+        val.tofile(saved_path.as_posix())
     # elif (
     #         key.find("SelfAttention.o.weight") != -1
     #         or key.find("EncDecAttention.o.weight") != -1

From ec035dfa655aa95e6edfbcb533010a65ad769d98 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:07:05 -0700
Subject: [PATCH 042/262] Simplify fc1/fc2 layer-index parsing and export final_layer_norm

---
 .../utils/huggingface_bart_ckpt_convert.py    | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index d29b23675..919d00270 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -151,7 +151,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc1.weight')[0])
+        layer = int(key.split('layers.')[1].split('.fc1.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc1.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("fc1.bias") != -1:
@@ -159,7 +159,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc1.bias')[0])
+        layer = int(key.split('layers.')[1].split('.fc1.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc1.bias.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("fc2.weight") != -1:
@@ -167,7 +167,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc2.weight')[0])
+        layer = int(key.split('layers.')[1].split('.fc2.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("fc1.bias") != -1:
@@ -175,24 +175,24 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc2.bias')[0])
+        layer = int(key.split('layers.')[1].split('.fc2.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.bias.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("self_attn_layer_norm.weight") != -1:
+    elif key.find("final_layer_norm.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.self_attn')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.weight.bin"
+        layer = int(key.split('layers.')[1].split('.final_layer_norm.')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.final_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("self_attn_layer_norm.bias") != -1:
+    elif key.find("final_layer_norm.bias") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.self_attn')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
+        layer = int(key.split('layers.')[1].split('.final_layer_norm.')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
     # elif (
     #         key.find("SelfAttention.o.weight") != -1

From 0e3fe4d4e327ab4aa01375eb1021df3af7e27923 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:08:16 -0700
Subject: [PATCH 043/262] Fix fc2.bias branch that was matching the fc1.bias key

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 919d00270..771ed88d6 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -170,7 +170,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.fc2.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.weight.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("fc1.bias") != -1:
+    elif key.find("fc2.bias") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:

From 83668eaad8e723ee1e56fdb7097c5bbb7ef4cd9b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:10:48 -0700
Subject: [PATCH 044/262] Export embed_positions/embed_tokens for both encoder and decoder

---
 .../utils/huggingface_bart_ckpt_convert.py     | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 771ed88d6..87df7af67 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -68,11 +68,19 @@ def split_and_convert_process(key, val, factor, saved_dir):
         val = val.transpose(1, 0)
     LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
 
-    if key.find("encoder.embed_positions.weight") != -1:
-        saved_path = saved_dir / "encoder.embed_positions.weight.bin"
-        val[2:, :].tofile(saved_path.as_posix())
-    elif key.find("encoder.embed_tokens.weight") != -1:
-        saved_path = saved_dir / "encoder.embed_tokens.weight.bin"
+    if key.find(".embed_positions.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        saved_path = saved_dir / f"{prefix}.embed_positions.weight.bin"
+        val[:, 2].tofile(saved_path.as_posix())
+    elif key.find(".embed_tokens.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        saved_path = saved_dir / f"{prefix}.embed_tokens.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder.layernorm_embedding.weight") != -1:
         saved_path = saved_dir / "encoder.final_layer_norm.weight.bin"

From 432b3e49513b82849d984a1aedd9ed6315bbc17f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:14:33 -0700
Subject: [PATCH 045/262] Add encoder_attn weight/bias export branches to BART converter

---
 .../utils/huggingface_bart_ckpt_convert.py    | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 87df7af67..a546c077a 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -154,6 +154,72 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
+    elif (
+        key.find("encoder_attn.k_proj.weight") != -1
+        or key.find("encoder_attn.v_proj.weight") != -1
+        or key.find("encoder_attn.q_proj.weight") != -1
+    ):
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
+        qkv = key.split('encoder_attn.')[1][:1]
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.weight.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif (
+        key.find("encoder_attn.k_proj.bias") != -1
+        or key.find("encoder_attn.v_proj.bias") != -1
+        or key.find("encoder_attn.q_proj.bias") != -1
+    ):
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
+        qkv = key.split('encoder_attn.')[1][:1]
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.bias.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("encoder_attn.out_proj.weight") != -1:
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.weight.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("encoder_attn.out_proj.bias") != -1:
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.bias.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("encoder_attn_layer_norm.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.weight.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("encoder_attn_layer_norm.bias") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
+        val.tofile(saved_path.as_posix())
     elif key.find("fc1.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"

From 67bc7a5ef0b791b10074aaf2617cb2e7c1821697 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 22:15:45 -0700
Subject: [PATCH 046/262] Export layernorm_embedding for both encoder and decoder

---
 .../bart/utils/huggingface_bart_ckpt_convert.py  | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index a546c077a..d520c2f93 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -82,11 +82,19 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "decoder"
         saved_path = saved_dir / f"{prefix}.embed_tokens.weight.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("encoder.layernorm_embedding.weight") != -1:
-        saved_path = saved_dir / "encoder.final_layer_norm.weight.bin"
+    elif key.find(".layernorm_embedding.weight") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        saved_path = saved_dir / f"{prefix}.final_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("encoder.layernorm_embedding.bias") != -1:
-        saved_path = saved_dir / "encoder.final_layer_norm.bias.bin"
+    elif key.find(".layernorm_embedding.bias") != -1:
+        if key.find("encoder") != -1:
+            prefix = "encoder"
+        else:
+            prefix = "decoder"
+        saved_path = saved_dir / f"{prefix}.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
     elif (
         key.find("self_attn.k_proj.weight") != -1

From 6405e9757bc19bf29ed8eb3a4d20529224dec8d2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 23:08:34 -0700
Subject: [PATCH 047/262] Load BartDecodingWeight from .bin checkpoint files

---
 .../models/bart/BartDecodingWeight.cc         | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecodingWeight.cc b/src/fastertransformer/models/bart/BartDecodingWeight.cc
index 7789eb0cf..8f73140f2 100644
--- a/src/fastertransformer/models/bart/BartDecodingWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecodingWeight.cc
@@ -256,8 +256,27 @@ void BartDecodingWeight<T>::loadModel(std::string dir_path)
 {
     FT_LOG_DEBUG("BartDecodingWeight " + std::string(__func__) + " start");
 
-    FT_LOG_DEBUG(
-        "Currently only support checkpoint loading from PyTorch interface outside FT. Direct checkpoint .bin loading support TBD");
+    // 8: [0] absolute/relative positional embedding weight [1] word embedding weight [2] word embedding 2 weight [3]
+    // pre-LN weight [4] post-LN weight [5] pre-LN bias [6] post-LN bias [7] word embedding 2 bias. Assuming both mBART
+    // and bias
+    loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/decoder.embed_positions.weight.bin", model_file_type);
+    loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/decoder.embed_tokens.weight.bin", model_file_type);
+    loadWeightFromBin<T>(weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/decoder.lm_head.weight.bin", model_file_type);
+    loadWeightFromBin<T>(
+        weights_ptr[3], {(size_t)weights_size[3]}, dir_path + "/decoder.final_layer_norm.weight.bin", model_file_type);
+    if (bart_with_bias) {
+        loadWeightFromBin<T>(weights_ptr[4],
+                             {(size_t)weights_size[4]},
+                             dir_path + "/decoder.final_layer_norm.bias.bin",
+                             model_file_type);
+    }
+
+    for (int l = 0; l < num_layer_; l++) {
+        if (isValidLayerParallelId(l)) {
+            bart_encoder_layer_weights[l]->loadModel(dir_path + "/decoder." + std::to_string(l) + ".",
+                                                   model_file_type);
+        }
+    }
 
     FT_LOG_DEBUG("BartDecodingWeight " + std::string(__func__) + " end");
 }

From 9468a5abec8b7f614854b3607f918453c1cdf8e3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 23:16:29 -0700
Subject: [PATCH 048/262] Export lm_head and final_logits_bias; remove commented-out converter code

---
 .../utils/huggingface_bart_ckpt_convert.py    | 59 ++-----------------
 .../models/bart/BartDecodingWeight.cc         |  2 +
 .../models/bart/BartEncoderWeight.cc          |  5 --
 3 files changed, 8 insertions(+), 58 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index d520c2f93..b55d3c476 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -276,59 +276,12 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.final_layer_norm.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
-    # elif (
-    #         key.find("SelfAttention.o.weight") != -1
-    #         or key.find("EncDecAttention.o.weight") != -1
-    #         or key.find("DenseReluDense.wo.weight") != -1
-    # ):
-    #     split_vals = np.split(val, factor, axis=0)
-    #     for j in range(factor):
-    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-    #         split_vals[j].tofile(saved_path.as_posix())
-
-    # elif (
-    #         key.find("DenseReluDense.wi.weight") != -1
-    #         or (key.find("encoder") != -1 and (
-    #         key.find("SelfAttention.q.weight") != -1
-    #         or key.find("SelfAttention.k.weight") != -1
-    #         or key.find("SelfAttention.v.weight") != -1
-    # )
-    #         )
-    #         or key.find("EncDecAttention.q.weight") != -1
-    #         or key.find("EncDecAttention.k.weight") != -1
-    #         or key.find("EncDecAttention.v.weight") != -1
-    # ):
-    #     split_vals = np.split(val, factor, axis=-1)
-    #     for j in range(factor):
-    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-    #         split_vals[j].tofile(saved_path.as_posix())
-    # elif (
-    #         key.find("DenseReluDense.wi_0.weight") != -1
-    #         or key.find("DenseReluDense.wi_1.weight") != -1
-    # ):
-    #     # For gated activation.
-    #     if key.find("DenseReluDense.wi_0.weight") != -1:
-    #         saved_key = key.replace("wi_0", "wi")
-    #     elif key.find("DenseReluDense.wi_1.weight") != -1:
-    #         saved_key = key.replace("wi_1", "wi2")
-    #     split_vals = np.split(val, factor, axis=-1)
-    #     for j in range(factor):
-    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-    #         split_vals[j].tofile(saved_path.as_posix())
-    # elif key.find("relative_attention_bias") != -1:
-    #     split_vals = np.split(val, factor, axis=0)
-    #     for j in range(factor):
-    #         saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
-    #         split_vals[j].tofile(saved_path.as_posix())
-    # elif (
-    #         key.find("decoder") != -1 and
-    #         (
-    #                 key.find("SelfAttention.q.weight") != -1
-    #                 or key.find("SelfAttention.k.weight") != -1
-    #                 or key.find("SelfAttention.v.weight") != -1
-    #         )
-    # ):
-    #     pass
+    elif key.find("lm_head.weight") != -1:
+        saved_path = saved_dir / "decoder.lm_head.weight.bin"
+        val.tofile(saved_path.as_posix())
+    elif key.find("final_logits_bias") != -1:
+        saved_path = saved_dir / "decoder.final_logits.bias"
+        val.tofile(saved_path.as_posix())
     elif key.find("encoder.embed_tokens.weight") != -1 or \
             key.find("decoder.embed_tokens.weight") != -1:
         LOGGER.warning(f"Not save {key}, using shared.weight directly.")
diff --git a/src/fastertransformer/models/bart/BartDecodingWeight.cc b/src/fastertransformer/models/bart/BartDecodingWeight.cc
index 8f73140f2..f31c8eaa2 100644
--- a/src/fastertransformer/models/bart/BartDecodingWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecodingWeight.cc
@@ -264,6 +264,8 @@ void BartDecodingWeight<T>::loadModel(std::string dir_path)
     loadWeightFromBin<T>(weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/decoder.lm_head.weight.bin", model_file_type);
     loadWeightFromBin<T>(
         weights_ptr[3], {(size_t)weights_size[3]}, dir_path + "/decoder.final_layer_norm.weight.bin", model_file_type);
+    loadWeightFromBin<T>(
+        weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/decoder.final_layer_norm.weight.bin", model_file_type);
     if (bart_with_bias) {
         loadWeightFromBin<T>(weights_ptr[4],
                              {(size_t)weights_size[4]},
diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index b20fdeeef..b4b8a4772 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -252,11 +252,6 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
     FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "encoder");
     FT_CHECK(is_maintain_buffer == true);
 
-    /*
-        // 6: [0] absolute/relative positional embedding weight [1] word embedding weight [2] pre-LN weight [3] post-LN
-    // weight [4] pre-LN bias[5] post-LN bias. Assuming both mBART and bias
-    */
-
     loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/encoder.embed_positions.weight.bin", model_file_type);
     loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/encoder.embed_tokens.weight.bin", model_file_type);
     loadWeightFromBin<T>(

From b9f6b3b76f5d6e61fbfeb1c632132d3cc524f5b2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 23:16:51 -0700
Subject: [PATCH 049/262] Rename final logits bias output to decoder.final_logits_bias.bin

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index b55d3c476..3ded1e6d8 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -280,7 +280,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         saved_path = saved_dir / "decoder.lm_head.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("final_logits_bias") != -1:
-        saved_path = saved_dir / "decoder.final_logits.bias"
+        saved_path = saved_dir / "decoder.final_logits_bias.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder.embed_tokens.weight") != -1 or \
             key.find("decoder.embed_tokens.weight") != -1:

From 5facbd0111bd2113d3f951210c4712faf4a665ec Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 23:19:03 -0700
Subject: [PATCH 050/262] Load final_logits_bias into weights_ptr[5] when bart_with_bias is set

---
 src/fastertransformer/models/bart/BartDecodingWeight.cc | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecodingWeight.cc b/src/fastertransformer/models/bart/BartDecodingWeight.cc
index f31c8eaa2..af7af3b9d 100644
--- a/src/fastertransformer/models/bart/BartDecodingWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecodingWeight.cc
@@ -256,21 +256,17 @@ void BartDecodingWeight<T>::loadModel(std::string dir_path)
 {
     FT_LOG_DEBUG("BartDecodingWeight " + std::string(__func__) + " start");
 
-    // 8: [0] absolute/relative positional embedding weight [1] word embedding weight [2] word embedding 2 weight [3]
-    // pre-LN weight [4] post-LN weight [5] pre-LN bias [6] post-LN bias [7] word embedding 2 bias. Assuming both mBART
-    // and bias
     loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/decoder.embed_positions.weight.bin", model_file_type);
     loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/decoder.embed_tokens.weight.bin", model_file_type);
     loadWeightFromBin<T>(weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/decoder.lm_head.weight.bin", model_file_type);
     loadWeightFromBin<T>(
         weights_ptr[3], {(size_t)weights_size[3]}, dir_path + "/decoder.final_layer_norm.weight.bin", model_file_type);
-    loadWeightFromBin<T>(
-        weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/decoder.final_layer_norm.weight.bin", model_file_type);
     if (bart_with_bias) {
         loadWeightFromBin<T>(weights_ptr[4],
                              {(size_t)weights_size[4]},
                              dir_path + "/decoder.final_layer_norm.bias.bin",
                              model_file_type);
+        loadWeightFromBin<T>(weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/decoder.final_logits_bias.bin", model_file_type);
     }
 
     for (int l = 0; l < num_layer_; l++) {

From 4eb759b4aa6974a4465922e1e89e99a27a8775ee Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 18 Sep 2023 23:20:04 -0700
Subject: [PATCH 051/262] Use decoder_layer_weights in BartDecodingWeight::loadModel

---
 src/fastertransformer/models/bart/BartDecodingWeight.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecodingWeight.cc b/src/fastertransformer/models/bart/BartDecodingWeight.cc
index af7af3b9d..cd67625d5 100644
--- a/src/fastertransformer/models/bart/BartDecodingWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecodingWeight.cc
@@ -271,7 +271,7 @@ void BartDecodingWeight<T>::loadModel(std::string dir_path)
 
     for (int l = 0; l < num_layer_; l++) {
         if (isValidLayerParallelId(l)) {
-            bart_encoder_layer_weights[l]->loadModel(dir_path + "/decoder." + std::to_string(l) + ".",
+            decoder_layer_weights[l]->loadModel(dir_path + "/decoder." + std::to_string(l) + ".",
                                                    model_file_type);
         }
     }

From 7fadeba2873f831be723240790874eed81b03d1f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 17:33:23 -0700
Subject: [PATCH 052/262] Load BartDecoderLayerWeight from .bin checkpoint files

---
 .../models/bart/BartDecoderLayerWeight.cc     | 72 ++++++++++++++++++-
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 3b17c7317..7c0725d55 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -274,8 +274,76 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 {
     FT_LOG_DEBUG("BartDecoderLayerWeight " + std::string(__func__) + " start");
 
-    FT_LOG_DEBUG(
-        "Currently only support checkpoint loading from PyTorch interface outside FT. Direct checkpoint .bin loading support TBD");
+    const auto tp_rank = std::to_string(tensor_para_rank_);
+    loadWeightFromBin<T>(weights_ptr_[0],
+                         {weights_size_[0]},
+                         dir_path + "layer.SelfAttention.q.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[1],
+                         {weights_size_[1]},
+                         dir_path + "layer.SelfAttention.k.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[2],
+                         {weights_size_[2]},
+                         dir_path + "layer.SelfAttention.v.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[3],
+                         {weights_size_[3]},
+                         dir_path + "layer.SelfAttention.out_proj.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[4],
+                         {weights_size_[4]},
+                         dir_path + "layer.SelfAttention.attn_layer_norm.weight.bin",
+                         model_file_type);
+
+    loadWeightFromBin<T>(weights_ptr_[5],
+                        {weights_size_[5]},
+                        dir_path + "layer.SelfAttention.fc1.weight.bin",
+                        model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[6],
+                        {weights_size_[6]},
+                        dir_path + "layer.SelfAttention.fc2.weight.bin",
+                        model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[7],
+                        {weights_size_[7]},
+                        dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
+                        model_file_type);
+
+    if (bart_with_bias_) {
+        loadWeightFromBin<T>(weights_ptr_[8],
+                            {weights_size_[8]},
+                            dir_path + "layer.SelfAttention.q.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[9],
+                            {weights_size_[9]},
+                            dir_path + "layer.SelfAttention.k.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[10],
+                            {weights_size_[10]},
+                            dir_path + "layer.SelfAttention.v.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[11],
+                            {weights_size_[11]},
+                            dir_path + "layer.SelfAttention.out_proj.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[12],
+                            {weights_size_[12]},
+                            dir_path + "layer.SelfAttention.attn_layer_norm.bias.bin",
+                            model_file_type);
+
+        loadWeightFromBin<T>(weights_ptr_[13],
+                            {weights_size_[13]},
+                            dir_path + "layer.SelfAttention.fc1.bias.bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[14],
+                            {weights_size_[14]},
+                            dir_path + "layer.SelfAttention.fc2.bias.bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr_[15],
+                            {weights_size_[15]},
+                            dir_path + "layer.SelfAttention.final_layer_norm.bias.bin",
+                            model_file_type);       
+    }
 
     FT_LOG_DEBUG("BartDecoderLayerWeight " + std::string(__func__) + " end");
 }

From 889455e72a926a8a22d17adcb3fecfe0e0813b6a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 19:37:41 -0700
Subject: [PATCH 053/262] commit

---
 .../models/bart/BartDecoderLayerWeight.cc        | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 7c0725d55..17bafa44e 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -275,13 +275,25 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
     FT_LOG_DEBUG("BartDecoderLayerWeight " + std::string(__func__) + " start");
 
     const auto tp_rank = std::to_string(tensor_para_rank_);
+
+    layernorm_weights.gamma                               = weights_ptr[0];
+    self_attention_weights.query_weight.kernel            = weights_ptr[1];
+    self_attention_weights.attention_output_weight.kernel = weights_ptr[2];
+    self_attn_layernorm_weights.gamma                     = weights_ptr[3];
+
+    cross_attention_weights.query_weight.kernel            = weights_ptr[4];
+    cross_attention_weights.key_weight.kernel              = weights_ptr[5];
+    cross_attention_weights.value_weight.kernel            = weights_ptr[6];
+    cross_attention_weights.attention_output_weight.kernel = weights_ptr[7];
+    cross_attn_layernorm_weights.gamma                     = weights_ptr[8];
+
     loadWeightFromBin<T>(weights_ptr_[0],
                          {weights_size_[0]},
-                         dir_path + "layer.SelfAttention.q.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[1],
                          {weights_size_[1]},
-                         dir_path + "layer.SelfAttention.k.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.q.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[2],
                          {weights_size_[2]},

From 2b2f3bd97a33a60ff986e06652a93379a5e5f551 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 19:38:37 -0700
Subject: [PATCH 054/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 3ded1e6d8..ef38c78a4 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -175,7 +175,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         qkv = key.split('encoder_attn.')[1][:1]
         for j in range(factor):
-            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.weight.{j:d}.bin"
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.{qkv}.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif (
         key.find("encoder_attn.k_proj.bias") != -1

From b578e43aa7cfb14705cf788fc4846e7e803378e2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 19:51:21 -0700
Subject: [PATCH 055/262] commit

---
 .../models/bart/BartDecoderLayerWeight.cc     | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 17bafa44e..3c0e95909 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -289,29 +289,41 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
     loadWeightFromBin<T>(weights_ptr_[0],
                          {weights_size_[0]},
-                         dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.final_layer_norm.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[1],
                          {weights_size_[1]},
-                         dir_path + "layer.SelfAttention.q.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.qkv.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[2],
                          {weights_size_[2]},
-                         dir_path + "layer.SelfAttention.v.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.out_proj.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[3],
                          {weights_size_[3]},
-                         dir_path + "layer.SelfAttention.out_proj.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[4],
                          {weights_size_[4]},
-                         dir_path + "layer.SelfAttention.attn_layer_norm.weight.bin",
+                         dir_path + "layer.CrossAttention.q.weight." + tp_rank + ".bin",
                          model_file_type);
-
     loadWeightFromBin<T>(weights_ptr_[5],
-                        {weights_size_[5]},
-                        dir_path + "layer.SelfAttention.fc1.weight.bin",
+                         {weights_size_[5]},
+                         dir_path + "layer.CrossAttention.k.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[6],
+                         {weights_size_[6]},
+                         dir_path + "layer.CrossAttention.v.weight." + tp_rank + ".bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[7],
+                         {weights_size_[7]},
+                         dir_path + "layer.CrossAttention.attn_layer_norm.weight.bin",
+                         model_file_type);
+    loadWeightFromBin<T>(weights_ptr_[8],
+                        {weights_size_[8]},
+                        dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
                         model_file_type);
+                        
     loadWeightFromBin<T>(weights_ptr_[6],
                         {weights_size_[6]},
                         dir_path + "layer.SelfAttention.fc2.weight.bin",

From e391316e87dafb5a73f324b2556baef9ce295491 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:09:16 -0700
Subject: [PATCH 056/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py      | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index ef38c78a4..c273ccee1 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -213,20 +213,12 @@ def split_and_convert_process(key, val, factor, saved_dir):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn_layer_norm.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.weight.bin"
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.attn_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder_attn_layer_norm.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("fc1.weight") != -1:
         if key.find("encoder") != -1:

From 1cbca5949a4ff3c6affaf28c5badb8d1945a9459 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:18:02 -0700
Subject: [PATCH 057/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py      | 12 ++----------
 .../models/bart/BartDecoderLayerWeight.cc            |  6 +++---
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index c273ccee1..28812d327 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -194,23 +194,15 @@ def split_and_convert_process(key, val, factor, saved_dir):
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn.out_proj.weight") != -1:
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         for j in range(factor):
-            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.weight.{j:d}.bin"
+            saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.out_proj.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn.out_proj.bias") != -1:
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         for j in range(factor):
-            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.bias.{j:d}.bin"
+            saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.out_proj.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn_layer_norm.weight") != -1:
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 3c0e95909..168a0127d 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -317,13 +317,13 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[7],
                          {weights_size_[7]},
-                         dir_path + "layer.CrossAttention.attn_layer_norm.weight.bin",
+                         dir_path + "layer.CrossAttention.out_proj.weight." + tp_rank + ".bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[8],
                         {weights_size_[8]},
-                        dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
+                        dir_path + "layer.CrossAttention.attn_layer_norm.weight." + tp_rank + ".bin",
                         model_file_type);
-                        
+
     loadWeightFromBin<T>(weights_ptr_[6],
                         {weights_size_[6]},
                         dir_path + "layer.SelfAttention.fc2.weight.bin",

From bbfd31e42d3f2d12b40104c43f1997e8c1d89b38 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:27:23 -0700
Subject: [PATCH 058/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 37 ++++++++-----------
 .../models/bart/BartDecoderLayerWeight.cc     | 13 ++++---
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 28812d327..848844a64 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -212,37 +212,32 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
-    elif key.find("fc1.weight") != -1:
+    elif key.find("fc1.weight") != -1 or key.find("fc2.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc1.')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc1.weight.bin"
-        val.tofile(saved_path.as_posix())
-    elif key.find("fc1.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
+        split_vals = np.split(val, factor, axis=0)
+        if key.find("fc1.weight") != -1:
+            fc = 'fc1'
         else:
-            prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc1.')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc1.bias.bin"
-        val.tofile(saved_path.as_posix())
-    elif key.find("fc2.weight") != -1:
+            fc = 'fc2'
+        layer = int(key.split('layers.')[1].split(f'.{fc}.')[0])
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.weight.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+    elif key.find("fc1.bias") != -1 or key.find("fc2.bias") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:
             prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc2.')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.weight.bin"
-        val.tofile(saved_path.as_posix())
-    elif key.find("fc2.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
+        if key.find("fc1.weight") != -1:
+            fc = 'fc1'
         else:
-            prefix = "decoder"
-        layer = int(key.split('layers.')[1].split('.fc2.')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.fc2.bias.bin"
+            fc = 'fc2'
+        layer = int(key.split('layers.')[1].split(f'.{fc}.')[0])
+        split_vals = np.split(val, factor, axis=0)
+        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.bias.{j:d}.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("final_layer_norm.weight") != -1:
         if key.find("encoder") != -1:
diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 168a0127d..6e5eb4dea 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -321,15 +321,18 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr_[8],
                         {weights_size_[8]},
-                        dir_path + "layer.CrossAttention.attn_layer_norm.weight." + tp_rank + ".bin",
+                        dir_path + "layer.CrossAttention.attn_layer_norm.weight.bin",
                         model_file_type);
 
-    loadWeightFromBin<T>(weights_ptr_[6],
-                        {weights_size_[6]},
+        ffn_weights.intermediate_weight.kernel = weights_ptr[9];
+        ffn_weights.output_weight.kernel       = weights_ptr[10];
+
+    loadWeightFromBin<T>(weights_ptr_[9],
+                        {weights_size_[9]},
                         dir_path + "layer.SelfAttention.fc2.weight.bin",
                         model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[7],
-                        {weights_size_[7]},
+    loadWeightFromBin<T>(weights_ptr_[10],
+                        {weights_size_[10]},
                         dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
                         model_file_type);
 

From de43b4d121b17aea15b63765c04003ad2e800efb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:35:10 -0700
Subject: [PATCH 059/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 848844a64..04c44f484 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -231,7 +231,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
-        if key.find("fc1.weight") != -1:
+        if key.find("fc1.bias") != -1:
             fc = 'fc1'
         else:
             fc = 'fc2'

From cf6ae4f0e9c4ccebbe723e45b1f43d52302a0916 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:37:04 -0700
Subject: [PATCH 060/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 04c44f484..90e4b2e54 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -64,6 +64,7 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
 
 
 def split_and_convert_process(key, val, factor, saved_dir):
+    print(key)
     if val.ndim == 2:
         val = val.transpose(1, 0)
     LOGGER.debug(f"key: {key}, val.shape: {val.shape}")

From ddc6e06f75cbb2f8c1cc18ab98670f6d25e10458 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:40:47 -0700
Subject: [PATCH 061/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 90e4b2e54..cd99d4325 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -219,7 +219,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         else:
             prefix = "decoder"
         split_vals = np.split(val, factor, axis=0)
-        if key.find("fc1.weight") != -1:
+        if key.find("fc1.") != -1:
             fc = 'fc1'
         else:
             fc = 'fc2'
@@ -227,7 +227,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
-    elif key.find("fc1.bias") != -1 or key.find("fc2.bias") != -1:
+    elif key.find("fc1.") != -1 or key.find("fc2.bias") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:

From fe899c2f5bccfab81aaf7369969072a28910d827 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:41:04 -0700
Subject: [PATCH 062/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index cd99d4325..d1630db05 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -227,12 +227,12 @@ def split_and_convert_process(key, val, factor, saved_dir):
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
-    elif key.find("fc1.") != -1 or key.find("fc2.bias") != -1:
+    elif key.find("fc1.bias") != -1 or key.find("fc2.bias") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:
             prefix = "decoder"
-        if key.find("fc1.bias") != -1:
+        if key.find("fc1.") != -1:
             fc = 'fc1'
         else:
             fc = 'fc2'

From 0c9668aea3b0d160d0c496a0db0a28ef69fe2a0d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:44:37 -0700
Subject: [PATCH 063/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index d1630db05..0d33dd91b 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -238,8 +238,9 @@ def split_and_convert_process(key, val, factor, saved_dir):
             fc = 'fc2'
         layer = int(key.split('layers.')[1].split(f'.{fc}.')[0])
         split_vals = np.split(val, factor, axis=0)
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.bias.{j:d}.bin"
-        val.tofile(saved_path.as_posix())
+        for j in range(factor):
+            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.bias.{j:d}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
     elif key.find("final_layer_norm.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"

From f1ea8ca4d249dfb8590bb377c7aa9b73f1eb898f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:59:21 -0700
Subject: [PATCH 064/262] commit

---
 examples/cpp/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index da24d72c6..efacc9c7d 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+add_subdirectory(bart)
 add_subdirectory(bert)
 add_subdirectory(bert_int8)
 add_subdirectory(decoding)

From 7dd957beb5cdd68d8daf5969e90c96314bf28b29 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 20:59:58 -0700
Subject: [PATCH 065/262] commit

---
 examples/cpp/bart/CMakeLists.txt            |  18 +
 examples/cpp/bart/bad_words.csv             |   2 +
 examples/cpp/bart/bart_config.ini           |  34 ++
 examples/cpp/bart/bart_triton_example.cc    | 457 ++++++++++++++++++++
 examples/cpp/bart/check_with_huggingface.py |  16 +
 examples/cpp/bart/start_ids.csv             |   8 +
 examples/cpp/bart/stop_words.csv            |   2 +
 7 files changed, 537 insertions(+)
 create mode 100644 examples/cpp/bart/CMakeLists.txt
 create mode 100644 examples/cpp/bart/bad_words.csv
 create mode 100644 examples/cpp/bart/bart_config.ini
 create mode 100644 examples/cpp/bart/bart_triton_example.cc
 create mode 100644 examples/cpp/bart/check_with_huggingface.py
 create mode 100644 examples/cpp/bart/start_ids.csv
 create mode 100644 examples/cpp/bart/stop_words.csv

diff --git a/examples/cpp/bart/CMakeLists.txt b/examples/cpp/bart/CMakeLists.txt
new file mode 100644
index 000000000..5cceacb32
--- /dev/null
+++ b/examples/cpp/bart/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(bart_triton_example bart_triton_example.cc)  # example binary built from the single source file in this directory
+target_link_libraries(bart_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread  # CUDA and pthread libraries passed as raw linker flags
+                      BartTritonBackend TransformerTritonBackend custom_ar_comm  # NOTE(review): remaining names are presumably CMake targets defined elsewhere in the project - confirm
+                      gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils)
diff --git a/examples/cpp/bart/bad_words.csv b/examples/cpp/bart/bad_words.csv
new file mode 100644
index 000000000..6a1126ebd
--- /dev/null
+++ b/examples/cpp/bart/bad_words.csv
@@ -0,0 +1,2 @@
+7768,3908
+1,2
diff --git a/examples/cpp/bart/bart_config.ini b/examples/cpp/bart/bart_config.ini
new file mode 100644
index 000000000..ef789d35d
--- /dev/null
+++ b/examples/cpp/bart/bart_config.ini
@@ -0,0 +1,34 @@
+[ft_instance_hyperparameter]
+data_type=fp16
+enable_custom_all_reduce=0
+
+tensor_para_size=1
+pipeline_para_size=1
+
+model_name=llama_7b
+model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
+
+[request]
+beam_width=1 # beam width for beam search
+top_k=1 ; k value for top k sampling
+top_p=0.0 ; p value for top p sampling
+temperature=1.0 ; Use for sampling
+repetition_penalty=1.0 ; Use for sampling
+presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty is allowed.
+len_penalty=0.0
+beam_search_diversity_rate=0.0
+request_batch_size=8 # determine by the request
+request_output_len=32 # determine by the request
+
+[llama_7b]
+head_num = 64
+kv_head_num = 8
+size_per_head = 128
+inter_size = 28672
+num_layer = 3
+rotary_embedding = 128
+layernorm_eps = 1e-05
+vocab_size = 32000
+start_id = 1
+end_id = 2
+weight_data_type = fp16
diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
new file mode 100644
index 000000000..3df2a2203
--- /dev/null
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "3rdparty/INIReader.h"
+#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h"
+#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/fastertransformer/utils/mpi_utils.h"
+#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/fastertransformer/utils/nvtx_utils.h"
+#include "src/fastertransformer/utils/word_list.h"
+
+#include <memory>
+#include <thread>
+
+namespace ft = fastertransformer;
+
+struct RequestParam {  // Generation settings copied into every per-GPU request; defaults are the neutral values.
+    int                    beam_width                 = 1;     // > 1 switches the request to beam search
+    int                    request_output_len         = 0;     // replicated once per sequence in the batch
+    float                  beam_search_diversity_rate = 0.0f;  // only sent when beam_width > 1
+    uint                   runtime_top_k              = 0;     // 0 => "runtime_top_k" tensor is not sent
+    float                  runtime_top_p              = 0.0f;  // 0 => "runtime_top_p" tensor is not sent
+    float                  temperature                = 1.0f;
+    float                  len_penalty                = 0.0f;
+    float                  repetition_penalty         = 1.0f;  // 1 => "repetition_penalty" tensor is not sent
+    float                  presence_penalty           = 0.0f;  // 0 => "presence_penalty" tensor is not sent
+    int                    min_length                 = 0;
+    unsigned long long int random_seed                = 0;
+    int                    start_id                   = 0;     // replicated once per sequence in the batch
+    int                    end_id                     = 0;     // replicated once per sequence in the batch
+};
+
+std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
+broadCastRequest(const std::vector<int>& v_start_ids,
+                 const std::vector<int>& v_start_lengths,
+                 const std::vector<int>& v_bad_words,
+                 const int               node_id,
+                 const int               gpu_count,
+                 const RequestParam      param,
+                 std::vector<void*>*     pointer_record)
+{
+    // Broadcast rank 0's request to every node, then build one tensor map per local GPU ("gpu_count" copies).
+    int size_1         = v_start_ids.size();
+    int size_2         = v_start_lengths.size();
+    int size_bad_words = v_bad_words.size();
+    ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
+    ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
+    ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
+    // Every rank sizes its buffers from the broadcast lengths; only node 0 copies its local data in.
+    std::vector<int> v_input_ids(size_1);
+    std::vector<int> v_input_lengths(size_2);
+    std::vector<int> v_input_bad_words(size_bad_words);
+
+    if (node_id == 0) {
+        memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int));
+        memcpy(v_input_lengths.data(), v_start_lengths.data(), size_2 * sizeof(int));
+        memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int));
+    }
+    ft::mpi::barrier();
+
+    int request_batch_size = size_2;
+    int max_input_len      = size_2 > 0 ? size_1 / size_2 : 0;  // empty batch: do not divide by zero
+
+    ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
+    ft::mpi::bcast(v_input_lengths.data(), size_2, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
+    ft::mpi::bcast(v_input_bad_words.data(), size_bad_words, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
+
+    std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list;
+    for (int device_id = 0; device_id < gpu_count; device_id++) {
+        ft::check_cuda_error(cudaSetDevice(device_id));
+
+        int* d_input_ids;
+        int* d_input_lengths;
+        int* d_input_bad_words;
+
+        if (max_input_len == 0) {
+            // unconditional case, no input ids, so do nothing.
+            d_input_ids     = nullptr;
+            d_input_lengths = nullptr;
+            max_input_len   = 0;
+        }
+        else {
+            // conditional case.
+            ft::deviceMalloc(&d_input_ids, size_1, false);
+            ft::deviceMalloc(&d_input_lengths, size_2, false);
+            ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1);
+            ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2);
+        }
+        ft::deviceMalloc(&d_input_bad_words, size_bad_words, false);
+        ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words);
+        // Per-sequence host arrays: requested output length, start id and end id.
+        uint32_t* request_output_len_ptr = (uint32_t*)malloc(request_batch_size * sizeof(uint32_t));
+        for (int i = 0; i < request_batch_size; i++) {
+            request_output_len_ptr[i] = param.request_output_len;
+        }
+
+        int* start_ids_ptr = (int*)malloc(request_batch_size * sizeof(int));
+        int* end_ids_ptr   = (int*)malloc(request_batch_size * sizeof(int));
+        for (int i = 0; i < request_batch_size; i++) {
+            start_ids_ptr[i] = param.start_id;
+            end_ids_ptr[i]   = param.end_id;
+        }
+        pointer_record->push_back(start_ids_ptr);
+        pointer_record->push_back(end_ids_ptr);
+
+        request_list.push_back(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>(
+            new std::unordered_map<std::string, triton::Tensor>{
+                {"input_ids",
+                 triton::Tensor{triton::MEMORY_GPU,
+                                triton::TYPE_INT32,
+                                std::vector<size_t>{(size_t)request_batch_size, (size_t)max_input_len},
+                                d_input_ids}},
+                {"input_lengths",
+                 triton::Tensor{triton::MEMORY_GPU,
+                                triton::TYPE_INT32,
+                                std::vector<size_t>{(size_t)request_batch_size},
+                                d_input_lengths}},
+                {"request_output_len",
+                 triton::Tensor{triton::MEMORY_CPU,
+                                triton::TYPE_INT32,
+                                std::vector<size_t>{(size_t)request_batch_size},
+                                request_output_len_ptr}},
+                {"bad_words_list",
+                 triton::Tensor{
+                     triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}},
+                {"start_id",
+                 triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}},
+                {"end_id",
+                 triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}));
+        // Scalar knobs follow; the optional ones are inserted only when they differ from their neutral value.
+        int* beam_width_ptr = new int(param.beam_width);
+        pointer_record->push_back(beam_width_ptr);
+        request_list[device_id]->insert(
+            {"beam_width",
+             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, beam_width_ptr}});
+        if (param.beam_width > 1) {
+            float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate);
+            pointer_record->push_back(beam_search_diversity_rate_ptr);
+            request_list[device_id]->insert(
+                {"beam_search_diversity_rate",
+                 triton::Tensor{
+                     triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, beam_search_diversity_rate_ptr}});
+        }
+        else {
+            if (param.runtime_top_p != 0.0f) {
+                float* runtime_top_p_ptr = new float(param.runtime_top_p);
+                pointer_record->push_back(runtime_top_p_ptr);
+                request_list[device_id]->insert(
+                    {"runtime_top_p",
+                     triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, runtime_top_p_ptr}});
+            }
+            if (param.runtime_top_k != 0) {
+                uint* runtime_top_k_ptr = new uint(param.runtime_top_k);
+                pointer_record->push_back(runtime_top_k_ptr);
+                request_list[device_id]->insert(
+                    {"runtime_top_k",
+                     triton::Tensor{
+                         triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector<size_t>{1}, runtime_top_k_ptr}});
+            }
+        }
+        float* temperature_ptr = new float(param.temperature);
+        pointer_record->push_back(temperature_ptr);
+        request_list[device_id]->insert(
+            {"temperature",
+             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, temperature_ptr}});
+        float* len_penalty_ptr = new float(param.len_penalty);
+        pointer_record->push_back(len_penalty_ptr);
+        request_list[device_id]->insert(
+            {"len_penalty",
+             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, len_penalty_ptr}});
+        if (param.repetition_penalty != 1.0f) {
+            float* repetition_penalty_ptr = new float(param.repetition_penalty);
+            pointer_record->push_back(repetition_penalty_ptr);
+            request_list[device_id]->insert(
+                {"repetition_penalty",
+                 triton::Tensor{
+                     triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, repetition_penalty_ptr}});
+        }
+        if (param.presence_penalty != 0.0f) {
+            float* presence_penalty_ptr = new float(param.presence_penalty);
+            pointer_record->push_back(presence_penalty_ptr);
+            request_list[device_id]->insert(
+                {"presence_penalty",
+                 triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, presence_penalty_ptr}});
+        }
+        int* min_length_ptr = new int(param.min_length);
+        pointer_record->push_back(min_length_ptr);
+        request_list[device_id]->insert(
+            {"min_length",
+             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, min_length_ptr}});
+        unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed);
+        pointer_record->push_back(random_seed_ptr);
+        request_list[device_id]->insert(
+            {"random_seed",
+             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector<size_t>{1}, random_seed_ptr}});
+        // Record the raw buffers for the caller; this function itself never frees them.
+        pointer_record->push_back(d_input_ids);
+        pointer_record->push_back(d_input_lengths);
+        pointer_record->push_back(d_input_bad_words);
+        pointer_record->push_back(request_output_len_ptr);
+    }
+
+    return request_list;
+}
+
+std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
+prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record)
+{  // Builds the per-GPU request maps from the INI file plus the CSV inputs, via broadCastRequest().
+    INIReader reader = INIReader(ini_name);
+    if (reader.ParseError() < 0) {
+        std::cout << "[ERROR] Can't load '" << ini_name << "'\n";
+        ft::FT_CHECK(false);
+    }
+    // Batch size comes from [request]; the start/end token ids come from the model section.
+    const size_t request_batch_size = reader.GetInteger("request", "request_batch_size");
+
+    const int start_id = reader.GetInteger("llama_7b", "start_id");
+    const int end_id   = reader.GetInteger("llama_7b", "end_id");
+
+    std::vector<int> v_start_ids;
+    std::vector<int> v_start_lengths;
+    // Prompts and bad words are read from the CSV files that ship next to this example.
+    size_t max_input_len = 0;
+    ft::read_start_ids(request_batch_size,
+                       &v_start_lengths,
+                       &v_start_ids,
+                       max_input_len,
+                       end_id,
+                       1,
+                       "../examples/cpp/bart/start_ids.csv");
+
+    std::vector<int> v_bad_words;
+    ft::read_word_list("../examples/cpp/bart/bad_words.csv", v_bad_words);
+
+    RequestParam param;
+    param.beam_width                 = reader.GetInteger("request", "beam_width");
+    param.request_output_len         = reader.GetInteger("request", "request_output_len");
+    param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate");
+    param.runtime_top_k              = reader.GetInteger("request", "top_k");
+    param.runtime_top_p              = reader.GetFloat("request", "top_p");
+    param.temperature                = reader.GetFloat("request", "temperature");
+    param.len_penalty                = reader.GetFloat("request", "len_penalty");
+    param.repetition_penalty         = reader.GetFloat("request", "repetition_penalty", 1.0f);
+    param.presence_penalty           = reader.GetFloat("request", "presence_penalty", 0.0f);
+    param.min_length                 = reader.GetInteger("request", "min_length", 0);
+    param.random_seed                = (unsigned long long int)0;
+    param.start_id                   = start_id;
+    param.end_id                     = end_id;
+
+    auto request_list =
+        broadCastRequest(v_start_ids, v_start_lengths, v_bad_words, node_id, gpu_count, param, pointer_record);
+    return request_list;
+}
+
+int threadCreateModelInstances(std::shared_ptr<AbstractTransformerModel>                         model,
+                               std::vector<std::unique_ptr<AbstractTransformerModelInstance>>*   model_instances,
+                               const int                                                         device_id,
+                               const int                                                         rank,
+                               std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
+                               std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr)
+{  // Thread entry: creates the model instance for `device_id` and stores it in its slot. Always returns 0.
+    printf("[INFO] rank = %d \n", rank);
+    ft::check_cuda_error(cudaSetDevice(device_id));  // select this thread's GPU before any other CUDA call
+    cudaStream_t instance_stream;                    // handed to the instance; not destroyed in this function
+    ft::check_cuda_error(cudaStreamCreate(&instance_stream));
+    model->createSharedWeights(device_id, rank);
+    auto created = model->createModelInstance(device_id, rank, instance_stream, nccl_params, custom_all_reduce_comm);
+    model_instances->at(device_id) = std::move(created);  // at() bounds-checks the slot index
+    printf("model instance %d is created \n", device_id);
+    ft::print_mem_usage();
+    return 0;
+}
+
+int threadForward(std::unique_ptr<AbstractTransformerModelInstance>*                model_instance,
+                  std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>  request,
+                  std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>* output_tensors,
+                  const int                                                         device_id)
+{  // Thread entry: runs one forward pass on `device_id` and stores the result in *output_tensors.
+    ft::check_cuda_error(cudaSetDevice(device_id));  // select this thread's GPU first
+    *output_tensors = model_instance->get()->forward(request);
+    return 0;
+}
+
+int main(int argc, char* argv[])
+{
+    /*
+        Example driver: create one model instance per local GPU, run the request described by the
+        INI file (argv[1]) on all of them, dump the generated ids on node 0 and time one more pass.
+    */
+
+    // Node id and node count come from MPI.
+    ft::mpi::initialize(&argc, &argv);
+    int node_id  = ft::mpi::getCommWorldRank();
+    int node_num = ft::mpi::getCommWorldSize();
+    std::cout << "node_id: " << node_id << ", node_num: " << node_num << std::endl;
+
+    // Note: Only supports that all nodes have same gpu count
+    const int   gpu_count  = ft::getDeviceCount();
+    std::cout << "gpu_count: " << gpu_count << std::endl;
+    const int   world_size = node_num * gpu_count;
+    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
+
+    // step 1: Create model
+    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createLlamaModel(ini_name);
+    int                                       tensor_para_size = model->getTensorParaSize();
+    int                                       pipeline_para_size = model->getPipelineParaSize();
+    FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),
+                       "World Size != Tensor Parallel Size * Pipeline Parallel Size !");
+
+    std::cout << model->toString();
+
+    // step 2: Initialize the NCCL
+    std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_comms = model->createNcclParams(node_id);
+    cudaDeviceSynchronize();
+
+    // Optional Step: create custom all reduce comm
+    std::vector<std::shared_ptr<ft::AbstractCustomComm>> custom_all_reduce_comms;
+    model->createCustomComms(&custom_all_reduce_comms, world_size);
+
+    // step 3: Create model instances
+    std::vector<std::unique_ptr<AbstractTransformerModelInstance>> model_instances((size_t)gpu_count);
+    std::vector<std::thread>                                       threads;
+    for (int device_id = 0; device_id < gpu_count; device_id++) {
+        const int rank = node_id * gpu_count + device_id;
+        threads.push_back(std::thread(threadCreateModelInstances,
+                                      model,
+                                      &model_instances,
+                                      device_id,
+                                      rank,
+                                      nccl_comms,
+                                      custom_all_reduce_comms[rank]));
+    }
+    for (auto& t : threads) {
+        t.join();
+    }
+
+    // step 4: prepare request
+    std::vector<void*> pointer_record;  // Keeps the request buffers reachable after prepareRequest() returns
+    std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record);
+    printf("[INFO] request is created \n");
+
+    // step 5: Forward
+    std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(
+        (size_t)gpu_count);
+    for (int i = 0; i < 2; i++) {
+        threads.clear();
+        for (int device_id = 0; device_id < gpu_count; device_id++) {
+            threads.push_back(std::thread(threadForward,
+                                          &model_instances[device_id],
+                                          request_list[device_id],
+                                          &output_tensors_lists[device_id],
+                                          device_id));
+        }
+        for (auto& t : threads) {
+            t.join();
+        }
+    }
+    printf("[INFO] forward is completed. \n");
+
+    const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data;
+    const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
+    const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
+    const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
+    const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data;
+    // step 6: check results
+    if (node_id == 0) {
+
+        std::string fName   = "out";
+        auto        outFile = std::ofstream(fName, std::ios::out);
+        if (!outFile.is_open()) {
+            printf("[WARNING] Cannot write results into output file %s \n", fName.c_str());
+        }
+        else {
+            const size_t     outCount = (size_t)batch_size * beam_width * seq_len;  // widen before multiplying
+            std::vector<int> hBuf(outCount);    // host copy of the generated ids
+            std::vector<int> iBuf(batch_size);  // host copy of the "input_lengths" output tensor
+            ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount);
+            ft::cudaD2Hcpy(iBuf.data(), d_input_lengths, batch_size);
+            // Print the input lengths, then write every generated id to the file and to stdout.
+
+            {
+                std::cout << "Writing " << outCount << " elements\n";
+                int zeroCount = 0;
+                for (int i = 0; i < batch_size; i++) {
+                    printf("%d ", iBuf[i]);
+                }
+                printf("\n");
+                for (size_t i = 0; i < outCount; i++) {
+                    if (hBuf[i] == int(0))
+                        zeroCount++;
+                    outFile << hBuf[i] << " ";
+                    if ((i + 1) % (seq_len) == 0)
+                        outFile << std::endl;
+
+                    // Every id is also echoed to stdout (no "first 10 only" filter).
+                    printf("%5d ", hBuf[i]);
+                    // Unlike the file, stdout gets no line break after each seq_len ids;
+                    // a single newline is printed once the loop is done.
+                }
+                std::cout << std::endl << "zeroCount = " << zeroCount << std::endl;
+            }
+            // hBuf and iBuf are std::vectors: both are released here (iBuf used to leak).
+        }
+    }
+
+    // test time
+    struct timeval start, end;
+    ft::mpi::barrier();
+    cudaDeviceSynchronize();
+    gettimeofday(&start, NULL);
+
+    const int ite = 1;
+    for (int i = 0; i < ite; i++) {
+        threads.clear();
+        for (int device_id = 0; device_id < gpu_count; device_id++) {
+            threads.push_back(std::thread(threadForward,
+                                          &model_instances[device_id],
+                                          request_list[device_id],
+                                          &output_tensors_lists[device_id],
+                                          device_id));
+        }
+        for (auto& t : threads) {
+            t.join();
+        }
+    }
+
+    cudaDeviceSynchronize();
+    ft::mpi::barrier();
+
+    gettimeofday(&end, NULL);
+
+    printf("[INFO] batch_size %d beam_width %d seq_len %d"
+           " FT-CPP-GPT-Triton-time %.2f ms\n",
+           batch_size,
+           beam_width,
+           seq_len,
+           ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
+
+    ft::mpi::finalize();
+    return 0;
+}
diff --git a/examples/cpp/bart/check_with_huggingface.py b/examples/cpp/bart/check_with_huggingface.py
new file mode 100644
index 000000000..d1f356cc1
--- /dev/null
+++ b/examples/cpp/bart/check_with_huggingface.py
@@ -0,0 +1,16 @@
+import transformers
+from transformers import LlamaForCausalLM, LlamaTokenizer
+
+# Runs the HuggingFace LLaMA checkpoint on a fixed prompt and prints its config and raw outputs.
+MODEL_DIR = '/data/llama-7b-hf'
+tokenizer = LlamaTokenizer.from_pretrained(MODEL_DIR)
+encoded = tokenizer("Hey, are you consciours? Can you talk to me?", return_tensors='pt')
+model = LlamaForCausalLM.from_pretrained(MODEL_DIR)
+print(vars(model.config))
+# A single forward() pass with hidden states requested; no generation loop is run here.
+hf_outputs = model.forward(encoded.input_ids, output_hidden_states=True)
+print(hf_outputs)
+
+# Decode a hard-coded id sequence (presumably copied from an example run -- TODO confirm).
+decoded_ids = [0,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366]
+print(tokenizer.decode(decoded_ids))
diff --git a/examples/cpp/bart/start_ids.csv b/examples/cpp/bart/start_ids.csv
new file mode 100644
index 000000000..6b8b9c375
--- /dev/null
+++ b/examples/cpp/bart/start_ids.csv
@@ -0,0 +1,8 @@
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637, 29892, 526, 366, 1136
diff --git a/examples/cpp/bart/stop_words.csv b/examples/cpp/bart/stop_words.csv
new file mode 100644
index 000000000..9b9b09eba
--- /dev/null
+++ b/examples/cpp/bart/stop_words.csv
@@ -0,0 +1,2 @@
+287, 4346, 12
+3, -1, -1

From 1a6cbeada7e8a0d2c04fe81ebf9fb186b240fb6c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:01:12 -0700
Subject: [PATCH 066/262] commit

---
 .../models/bart/BartDecoderLayerWeight.cc     | 76 +++++++++----------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 6e5eb4dea..c2adb3b06 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -287,87 +287,87 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
     cross_attention_weights.attention_output_weight.kernel = weights_ptr[7];
     cross_attn_layernorm_weights.gamma                     = weights_ptr[8];
 
-    loadWeightFromBin<T>(weights_ptr_[0],
-                         {weights_size_[0]},
+    loadWeightFromBin<T>(weights_ptr[0],
+                         {weights_size[0]},
                          dir_path + "layer.SelfAttention.final_layer_norm.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[1],
-                         {weights_size_[1]},
+    loadWeightFromBin<T>(weights_ptr[1],
+                         {weights_size[1]},
                          dir_path + "layer.SelfAttention.qkv.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[2],
-                         {weights_size_[2]},
+    loadWeightFromBin<T>(weights_ptr[2],
+                         {weights_size[2]},
                          dir_path + "layer.SelfAttention.out_proj.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[3],
-                         {weights_size_[3]},
+    loadWeightFromBin<T>(weights_ptr[3],
+                         {weights_size[3]},
                          dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[4],
-                         {weights_size_[4]},
+    loadWeightFromBin<T>(weights_ptr[4],
+                         {weights_size[4]},
                          dir_path + "layer.CrossAttention.q.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[5],
-                         {weights_size_[5]},
+    loadWeightFromBin<T>(weights_ptr[5],
+                         {weights_size[5]},
                          dir_path + "layer.CrossAttention.k.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[6],
-                         {weights_size_[6]},
+    loadWeightFromBin<T>(weights_ptr[6],
+                         {weights_size[6]},
                          dir_path + "layer.CrossAttention.v.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[7],
-                         {weights_size_[7]},
+    loadWeightFromBin<T>(weights_ptr[7],
+                         {weights_size[7]},
                          dir_path + "layer.CrossAttention.out_proj.weight." + tp_rank + ".bin",
                          model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[8],
-                        {weights_size_[8]},
+    loadWeightFromBin<T>(weights_ptr[8],
+                        {weights_size[8]},
                         dir_path + "layer.CrossAttention.attn_layer_norm.weight.bin",
                         model_file_type);
 
         ffn_weights.intermediate_weight.kernel = weights_ptr[9];
         ffn_weights.output_weight.kernel       = weights_ptr[10];
 
-    loadWeightFromBin<T>(weights_ptr_[9],
-                        {weights_size_[9]},
+    loadWeightFromBin<T>(weights_ptr[9],
+                        {weights_size[9]},
                         dir_path + "layer.SelfAttention.fc2.weight.bin",
                         model_file_type);
-    loadWeightFromBin<T>(weights_ptr_[10],
-                        {weights_size_[10]},
+    loadWeightFromBin<T>(weights_ptr[10],
+                        {weights_size[10]},
                         dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
                         model_file_type);
 
     if (bart_with_bias_) {
-        loadWeightFromBin<T>(weights_ptr_[8],
-                            {weights_size_[8]},
+        loadWeightFromBin<T>(weights_ptr[8],
+                            {weights_size[8]},
                             dir_path + "layer.SelfAttention.q.bias." + tp_rank + ".bin",
                             model_file_type);
-        loadWeightFromBin<T>(weights_ptr_[9],
-                            {weights_size_[9]},
+        loadWeightFromBin<T>(weights_ptr[9],
+                            {weights_size[9]},
                             dir_path + "layer.SelfAttention.k.bias." + tp_rank + ".bin",
                             model_file_type);
-        loadWeightFromBin<T>(weights_ptr_[10],
-                            {weights_size_[10]},
+        loadWeightFromBin<T>(weights_ptr[10],
+                            {weights_size[10]},
                             dir_path + "layer.SelfAttention.v.bias." + tp_rank + ".bin",
                             model_file_type);
-        loadWeightFromBin<T>(weights_ptr_[11],
-                            {weights_size_[11]},
+        loadWeightFromBin<T>(weights_ptr[11],
+                            {weights_size[11]},
                             dir_path + "layer.SelfAttention.out_proj.bias." + tp_rank + ".bin",
                             model_file_type);
-        loadWeightFromBin<T>(weights_ptr_[12],
-                            {weights_size_[12]},
+        loadWeightFromBin<T>(weights_ptr[12],
+                            {weights_size[12]},
                             dir_path + "layer.SelfAttention.attn_layer_norm.bias.bin",
                             model_file_type);
 
-        loadWeightFromBin<T>(weights_ptr_[13],
-                            {weights_size_[13]},
+        loadWeightFromBin<T>(weights_ptr[13],
+                            {weights_size[13]},
                             dir_path + "layer.SelfAttention.fc1.bias.bin",
                             model_file_type);
-        loadWeightFromBin<T>(weights_ptr_[14],
-                            {weights_size_[14]},
+        loadWeightFromBin<T>(weights_ptr[14],
+                            {weights_size[14]},
                             dir_path + "layer.SelfAttention.fc2.bias.bin",
                             model_file_type);
-        loadWeightFromBin<T>(weights_ptr_[15],
-                            {weights_size_[15]},
+        loadWeightFromBin<T>(weights_ptr[15],
+                            {weights_size[15]},
                             dir_path + "layer.SelfAttention.final_layer_norm.bias.bin",
                             model_file_type);       
     }

From 820f68cceba4e83956545c97523cb5d9cac75e2c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:02:20 -0700
Subject: [PATCH 067/262] commit

---
 src/fastertransformer/models/bart/BartDecodingWeight.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecodingWeight.cc b/src/fastertransformer/models/bart/BartDecodingWeight.cc
index cd67625d5..cede06c46 100644
--- a/src/fastertransformer/models/bart/BartDecodingWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecodingWeight.cc
@@ -256,6 +256,9 @@ void BartDecodingWeight<T>::loadModel(std::string dir_path)
 {
     FT_LOG_DEBUG("BartDecodingWeight " + std::string(__func__) + " start");
 
+    FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "decoder");
+    FT_CHECK(is_maintain_buffer == true);
+
     loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/decoder.embed_positions.weight.bin", model_file_type);
     loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/decoder.embed_tokens.weight.bin", model_file_type);
     loadWeightFromBin<T>(weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/decoder.lm_head.weight.bin", model_file_type);

From b84f60cf4498a016d9fa78440e4d742dd7ea536e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:02:40 -0700
Subject: [PATCH 068/262] BartDecodingWeight: use member name is_maintain_buffer_ in loadModel check

---
 src/fastertransformer/models/bart/BartDecodingWeight.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecodingWeight.cc b/src/fastertransformer/models/bart/BartDecodingWeight.cc
index cede06c46..f990d49c0 100644
--- a/src/fastertransformer/models/bart/BartDecodingWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecodingWeight.cc
@@ -257,7 +257,7 @@ void BartDecodingWeight<T>::loadModel(std::string dir_path)
     FT_LOG_DEBUG("BartDecodingWeight " + std::string(__func__) + " start");
 
     FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "decoder");
-    FT_CHECK(is_maintain_buffer == true);
+    FT_CHECK(is_maintain_buffer_ == true);
 
     loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/decoder.embed_positions.weight.bin", model_file_type);
     loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/decoder.embed_tokens.weight.bin", model_file_type);

From 9b465529bc9bc0afbec56561ba9c5a05c9fead13 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:05:19 -0700
Subject: [PATCH 069/262] bart_triton_example: call createBartModel instead of createLlamaModel

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 3df2a2203..41310901e 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -316,7 +316,7 @@ int main(int argc, char* argv[])
     std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
 
     // step 1: Create model
-    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createLlamaModel(ini_name);
+    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);
     int                                       tensor_para_size = model->getTensorParaSize();
     int                                       pipeline_para_size = model->getPipelineParaSize();
     FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),

From 2051da907c84d57816223538c97c8b2b08f5ecce Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:05:46 -0700
Subject: [PATCH 070/262] bart_triton_example: include Bart Triton headers instead of Llama headers

---
 examples/cpp/bart/bart_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 41310901e..42a6137c2 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -16,8 +16,8 @@
 
 #include "3rdparty/INIReader.h"
 #include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModel.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModelInstance.h"
 #include "src/fastertransformer/utils/custom_ar_comm.h"
 #include "src/fastertransformer/utils/mpi_utils.h"
 #include "src/fastertransformer/utils/nccl_utils.h"

From 5ffda86fd8ce50518d7c17470a1d9fca84b6165e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:06:35 -0700
Subject: [PATCH 071/262] bart_triton_example: default ini path to examples/cpp/bart/bart_config.ini

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 42a6137c2..402201dc0 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
     const int   gpu_count  = ft::getDeviceCount();
     std::cout << "gpu_count: " << gpu_count << std::endl;
     const int   world_size = node_num * gpu_count;
-    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
+    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/bart/bart_config.ini";
 
     // step 1: Create model
     std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);

From 755fd9c652129ec29c638595342975d0d19b950e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:07:06 -0700
Subject: [PATCH 072/262] bart_triton_example: move default ini path under /notebooks/tmp

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 402201dc0..0d05d0ef9 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
     const int   gpu_count  = ft::getDeviceCount();
     std::cout << "gpu_count: " << gpu_count << std::endl;
     const int   world_size = node_num * gpu_count;
-    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/bart/bart_config.ini";
+    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/bart_config.ini";
 
     // step 1: Create model
     std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);

From 6abce2a081e9585ee4c91187b414a5397fb8a900 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:07:58 -0700
Subject: [PATCH 073/262] bart_triton_example: default ini path to the bart example directory

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 0d05d0ef9..71c8bb083 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
     const int   gpu_count  = ft::getDeviceCount();
     std::cout << "gpu_count: " << gpu_count << std::endl;
     const int   world_size = node_num * gpu_count;
-    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/bart_config.ini";
+    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/";
 
     // step 1: Create model
     std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);

From 673c7ca410fdc11fa1d666185db7985dcefe3418 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:08:10 -0700
Subject: [PATCH 074/262] bart_triton_example: revert default ini path to bart_config.ini

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 71c8bb083..0d05d0ef9 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
     const int   gpu_count  = ft::getDeviceCount();
     std::cout << "gpu_count: " << gpu_count << std::endl;
     const int   world_size = node_num * gpu_count;
-    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/";
+    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/bart_config.ini";
 
     // step 1: Create model
     std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);

From 9a93d0a1a460fe04a55db69ee5d4ca4f4b51fcad Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:12:11 -0700
Subject: [PATCH 075/262] bart example: rename bart_config.ini to config.ini and default to its directory

---
 examples/cpp/bart/bart_triton_example.cc          | 2 +-
 examples/cpp/bart/{bart_config.ini => config.ini} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/cpp/bart/{bart_config.ini => config.ini} (100%)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 0d05d0ef9..71c8bb083 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
     const int   gpu_count  = ft::getDeviceCount();
     std::cout << "gpu_count: " << gpu_count << std::endl;
     const int   world_size = node_num * gpu_count;
-    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/bart_config.ini";
+    std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/";
 
     // step 1: Create model
     std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);
diff --git a/examples/cpp/bart/bart_config.ini b/examples/cpp/bart/config.ini
similarity index 100%
rename from examples/cpp/bart/bart_config.ini
rename to examples/cpp/bart/config.ini

From db339ca24ad5fd362e519e368567e7ec8b292e3f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:13:49 -0700
Subject: [PATCH 076/262] bart config.ini: replace llama_7b parameters with BART [encoder]/[decoder] sections

---
 examples/cpp/bart/config.ini | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index ef789d35d..a97e3b434 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -21,14 +21,20 @@ request_batch_size=8 # determine by the request
 request_output_len=32 # determine by the request
 
 [llama_7b]
-head_num = 64
-kv_head_num = 8
-size_per_head = 128
-inter_size = 28672
-num_layer = 3
-rotary_embedding = 128
-layernorm_eps = 1e-05
-vocab_size = 32000
-start_id = 1
-end_id = 2
-weight_data_type = fp16
+[encoder]
+num_heads = 12
+d_kv = 64
+d_model = 768
+d_ff = 3072
+num_layers = 6
+vocab_size = 50265
+max_pos_seq_len = 1024
+
+[decoder]
+num_heads = 12
+d_kv = 64
+d_model = 768
+d_ff = 3072
+num_layers = 6
+vocab_size = 50265
+max_pos_seq_len = 1024

From be63f1d4440947aeae8adab8d3c80ac012b17a34 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:31:07 -0700
Subject: [PATCH 077/262] huggingface_bart_ckpt_convert: write decoder_start_token_id to [decoder]

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 0d33dd91b..9980f072d 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -295,6 +295,7 @@ def convert_checkpoint(args):
     config["decoder"]["num_layers"] = str(hf_config["decoder_layers"])
     config["decoder"]["vocab_size"] = str(hf_config["vocab_size"])
     config["decoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
+    config["decoder"]["decoder_start_token_id"] = str(hf_config["decoder_start_token_id"])
 
     with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)

From 43b34b9505c00e8f94a046aac8dbffaa4cacb337 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:33:38 -0700
Subject: [PATCH 078/262] bart config.ini: add decoder_start_token_id to [decoder]

---
 examples/cpp/bart/config.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index a97e3b434..f1390992b 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -38,3 +38,4 @@ d_ff = 3072
 num_layers = 6
 vocab_size = 50265
 max_pos_seq_len = 1024
+decoder_start_token_id = 2

From 59ec3cfdcd8fbea5fd0f27cf7216086c3a7958b4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:34:00 -0700
Subject: [PATCH 079/262] bart config.ini: add eos_token_id to [decoder]

---
 examples/cpp/bart/config.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index f1390992b..ffafd4c53 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -39,3 +39,4 @@ num_layers = 6
 vocab_size = 50265
 max_pos_seq_len = 1024
 decoder_start_token_id = 2
+eos_token_id = 2

From 3a17aafc3501baed317ea1d0a47c400dc44b0d18 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:34:22 -0700
Subject: [PATCH 080/262] huggingface_bart_ckpt_convert: write eos_token_id to [decoder]

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 9980f072d..1572dfd2f 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -296,6 +296,7 @@ def convert_checkpoint(args):
     config["decoder"]["vocab_size"] = str(hf_config["vocab_size"])
     config["decoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
     config["decoder"]["decoder_start_token_id"] = str(hf_config["decoder_start_token_id"])
+    config["decoder"]["eos_token_id"] = str(hf_config["eos_token_id"])
 
     with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)

From b884f2766285b2de295332f78f95d3e9282b3266 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 21:59:29 -0700
Subject: [PATCH 081/262] BartTritonModel: add debug printf at start of createModelInstance

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 2082a5e62..ca6f1e1f8 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -138,6 +138,7 @@ BartTritonModel<T>::createModelInstance(int
                                       std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
                                       std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm)
 {
+    printf("createModelInstance\n");
     ft::check_cuda_error(cudaSetDevice(device_id));
     const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_);
 

From e8b03a25299b95d39410f512a85ffa928936cb40 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:00:27 -0700
Subject: [PATCH 082/262] BartTritonModel: add debug printf at start of createSharedWeights

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index ca6f1e1f8..2c1d49010 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -255,7 +255,8 @@ BartTritonModel<T>::createModelInstance(int
 
 template<typename T>
 void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
-{
+{   
+    printf("createSharedWeights\n");
     ft::check_cuda_error(cudaSetDevice(device_id));
     const int tensor_para_rank   = rank % tensor_para_size_;
     const int pipeline_para_rank = rank / tensor_para_size_;

From 3afe0a6abbd809227c1f3e611bceea053bbab718 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:01:03 -0700
Subject: [PATCH 083/262] BartTritonModel: add debug printf before loading shared weights

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 2c1d49010..5cbc4f191 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -296,6 +296,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                   use_gated_activation_,
                                                   position_embedding_type_);
 
+    printf("load model\n");
     encoder_shared_weights_[device_id]->loadModel(model_dir_);
     decoding_shared_weights_[device_id]->loadModel(model_dir_);
 }

From 42355a4b4fd289b5fb4aadffeb2fb4451492dea3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:02:21 -0700
Subject: [PATCH 084/262] BartTritonModel: add debug printf before encoder and decoding weight construction

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 5cbc4f191..c826d92c1 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -261,6 +261,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
     const int tensor_para_rank   = rank % tensor_para_size_;
     const int pipeline_para_rank = rank / tensor_para_size_;
 
+    printf("BartEncoderWeight\n");
     encoder_shared_weights_[device_id] =
         std::make_shared<ft::BartEncoderWeight<T>>(encoder_head_num_,
                                                  encoder_size_per_head_,
@@ -278,6 +279,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  use_gated_activation_,
                                                  position_embedding_type_);
 
+    printf("BartDecodingWeight\n");
     decoding_shared_weights_[device_id] =
         std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
                                                   decoding_size_per_head_,

From f7dfda2748de38ea234a4e472631d92a0382dd8e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:03:58 -0700
Subject: [PATCH 085/262] BartEncoderWeight: add debug printf after reserving layer weights

---
 src/fastertransformer/models/bart/BartEncoderWeight.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index b4b8a4772..ac1648711 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -62,6 +62,7 @@ BartEncoderWeight<T>::BartEncoderWeight(const size_t                head_num,
     setWeightPtr();
     bart_encoder_layer_weights.clear();
     bart_encoder_layer_weights.reserve(num_layer_);
+    printf("bart_encoder_layer_weights.reserve(num_layer_);\n");
     for (int l = 0; l < num_layer_; l++) {
         if (isValidLayerParallelId(l)) {
             bart_encoder_layer_weights.push_back(new BartEncoderLayerWeight<T>(head_num_,

From db8ba0818c9d259d4c792826d63ec829b8f70bcc Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:04:46 -0700
Subject: [PATCH 086/262] BartEncoderLayerWeight: add debug printf at start of constructor

---
 src/fastertransformer/models/bart/BartEncoderLayerWeight.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc b/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
index 4dff65721..9714cfa35 100644
--- a/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
@@ -38,6 +38,7 @@ BartEncoderLayerWeight<T>::BartEncoderLayerWeight(const size_t head_num,
     bart_with_bias_(bart_with_bias),
     use_gated_activation_(use_gated_activation)
 {
+    printf("BartEncoderLayerWeight\n");
     real_weights_num_ = (8 + (use_gated_activation_ ? 1 : 0))
                         * (bart_with_bias_ ? 2 : 1);  // 8: Q, K, V, O, LayerNorm1, FC1, FC2, LayerNorm2
     FT_LOG_DEBUG("BartEncoderLayerWeight " + std::string(__func__) + " start");

From d1d5f32382e5aa2fe514ddb047b1c472effe6a06 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:05:26 -0700
Subject: [PATCH 087/262] BartEncoderWeight: add debug printf at end of constructor

---
 src/fastertransformer/models/bart/BartEncoderWeight.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index ac1648711..eaeddfa03 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -80,6 +80,7 @@ BartEncoderWeight<T>::BartEncoderWeight(const size_t                head_num,
         }
     }
     FT_LOG_DEBUG("BartEncoderWeight " + std::string(__func__) + " end");
+    printf("BartEncoderWeight Done\n");
 }
 
 template<typename T>

From 5223f3cc444a58dee2dc35469aee260bc7c30b3b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:09:38 -0700
Subject: [PATCH 088/262] BartTritonModel: comment out decoding weight creation and loadModel calls for debugging

---
 .../triton_backend/bart/BartTritonModel.cc    | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index c826d92c1..29ed83709 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -280,27 +280,27 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  position_embedding_type_);
 
     printf("BartDecodingWeight\n");
-    decoding_shared_weights_[device_id] =
-        std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
-                                                  decoding_size_per_head_,
-                                                  decoding_d_model_,
-                                                  decoding_inter_size_,
-                                                  decoding_vocab_size_,
-                                                  decoding_num_layer_,
-                                                  encoder_d_model_,
-                                                  decoding_max_pos_seq_len_,
-                                                  tensor_para_size_,
-                                                  tensor_para_rank,
-                                                  pipeline_para_size_,
-                                                  pipeline_para_rank,
-                                                  bart_with_bias_,
-                                                  mbart_para_,
-                                                  use_gated_activation_,
-                                                  position_embedding_type_);
-
-    printf("load model\n");
-    encoder_shared_weights_[device_id]->loadModel(model_dir_);
-    decoding_shared_weights_[device_id]->loadModel(model_dir_);
+    // decoding_shared_weights_[device_id] =
+    //     std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
+    //                                               decoding_size_per_head_,
+    //                                               decoding_d_model_,
+    //                                               decoding_inter_size_,
+    //                                               decoding_vocab_size_,
+    //                                               decoding_num_layer_,
+    //                                               encoder_d_model_,
+    //                                               decoding_max_pos_seq_len_,
+    //                                               tensor_para_size_,
+    //                                               tensor_para_rank,
+    //                                               pipeline_para_size_,
+    //                                               pipeline_para_rank,
+    //                                               bart_with_bias_,
+    //                                               mbart_para_,
+    //                                               use_gated_activation_,
+    //                                               position_embedding_type_);
+
+    // printf("load model\n");
+    // encoder_shared_weights_[device_id]->loadModel(model_dir_);
+    // decoding_shared_weights_[device_id]->loadModel(model_dir_);
 }
 
 template<typename T>

From 5b6617900fc3314def3f50eb723ee8ffa74ef1f6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:12:04 -0700
Subject: [PATCH 089/262] BartEncoderWeight: add debug printf in copy constructor

---
 src/fastertransformer/models/bart/BartEncoderWeight.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index eaeddfa03..a28485722 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -156,6 +156,7 @@ BartEncoderWeight<T>::BartEncoderWeight(const BartEncoderWeight& other):
     position_embedding_type(other.position_embedding_type),
     real_weights_num_(other.real_weights_num_)
 {
+    printf("Copy BartEncoderWeight\n");
     FT_LOG_DEBUG("BartEncoderWeight " + std::string(__func__) + " start");
     initialize();
     mallocWeights();

From 108f295713396d1831aa27b46b57c549564bea9f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:16:17 -0700
Subject: [PATCH 090/262] BartTritonModel: print encoder_shared_weights_ size and device_id in createSharedWeights

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 29ed83709..3d5d4d024 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -261,7 +261,7 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
     const int tensor_para_rank   = rank % tensor_para_size_;
     const int pipeline_para_rank = rank / tensor_para_size_;
 
-    printf("BartEncoderWeight\n");
+    printf("BartEncoderWeight %d %d\n", encoder_shared_weights_.size(), device_id);
     encoder_shared_weights_[device_id] =
         std::make_shared<ft::BartEncoderWeight<T>>(encoder_head_num_,
                                                  encoder_size_per_head_,

From becd8682de7b4ca1509dd0f62d57a45cb3e4e643 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:23:46 -0700
Subject: [PATCH 091/262] createBartModel: construct BartTritonModel with (1, 1, 0, model_dir, 0) instead of the INIReader

---
 .../triton_backend/bart/BartTritonModel.cc                 | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 3d5d4d024..a6fa5c72a 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -32,15 +32,16 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createBartMo
 
     const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type");
     if (data_type == "fp16") {
-        return std::make_shared<BartTritonModel<half>>(reader, model_dir);
+        // return std::make_shared<BartTritonModel<half>>(reader, model_dir);
+        return std::make_shared<BartTritonModel<half>>(1, 1, 0, model_dir, 0);
     }
 #ifdef ENABLE_BF16
     else if (data_type == "bf16") {
-        return std::make_shared<BartTritonModel<__nv_bfloat16>>(reader, model_dir);
+        return std::make_shared<BartTritonModel<__nv_bfloat16>>(1, 1, 0, model_dir, 0);
     }
 #endif
     else if (data_type == "fp32") {
-        return std::make_shared<BartTritonModel<float>>(reader, model_dir);
+        return std::make_shared<BartTritonModel<float>>(1, 1, 0, model_dir, 0);
     }
     else {
         FT_LOG_ERROR("Unsupported data type " + data_type);

From 1a40c516c6cfb39d99380c73797a92e011f0bc95 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:25:01 -0700
Subject: [PATCH 092/262] BartTritonModel: read model name from [encoder] model_name instead of _name_or_path

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index a6fa5c72a..4aca7695f 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -70,7 +70,7 @@ BartTritonModel<T>::BartTritonModel(INIReader reader, std::string model_dir): mo
     decoding_vocab_size_    = reader.GetInteger("decoder", "vocab_size");
     decoding_max_pos_seq_len_ = reader.GetInteger("decoder", "max_pos_seq_len");
 
-    start_id_                 = reader.GetInteger("decoder", "decoder_start_token_id");
+    start_id_                 = reader.GetInteger("decoder", "decoder_start_token_id"); 
     end_id_                   = reader.GetInteger("decoder", "eos_token_id");
     tensor_para_size_         = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     pipeline_para_size_       = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
@@ -101,7 +101,7 @@ BartTritonModel<T>::BartTritonModel(size_t      tensor_para_size,
 
     ft::FT_CHECK(int8_mode_ == 0);
 
-    model_name_ = reader.Get("encoder", "_name_or_path");
+    model_name_ = reader.Get("encoder", "model_name");
     // encoder
     encoder_head_num_      = reader.GetInteger("encoder", "num_heads");
     encoder_size_per_head_ = reader.GetInteger("encoder", "d_kv");

From ebf12e9f2c2399c95f2234493bdb05f9189064eb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:25:26 -0700
Subject: [PATCH 093/262] huggingface_bart_ckpt_convert: write model_name = bart to [encoder]

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 1572dfd2f..c0b36b1f0 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -279,6 +279,7 @@ def convert_checkpoint(args):
     config = configparser.ConfigParser()
 
     config["encoder"] = {}
+    config["encoder"]["model_name"] = "bart"
     config["encoder"]["num_heads"] = str(hf_config["encoder_attention_heads"])
     config["encoder"]["d_kv"] = str(hf_config["d_model"] // hf_config["encoder_attention_heads"])
     config["encoder"]["d_model"] = str(hf_config["d_model"])

From 821e70f912d5da517999da5ed66c785912ff84a9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:25:41 -0700
Subject: [PATCH 094/262] bart example config: add model_name to the encoder section

---
 examples/cpp/bart/config.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index ffafd4c53..60f63140f 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -29,6 +29,7 @@ d_ff = 3072
 num_layers = 6
 vocab_size = 50265
 max_pos_seq_len = 1024
+model_name=bart
 
 [decoder]
 num_heads = 12

From 8a54c97acc85ea2606f887b67a456de3674f20d6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:29:33 -0700
Subject: [PATCH 095/262] bart: add feed_forward_proj to the encoder config (example ini and converter)

---
 examples/cpp/bart/config.ini                                 | 3 ++-
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index 60f63140f..7da37c255 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -29,7 +29,8 @@ d_ff = 3072
 num_layers = 6
 vocab_size = 50265
 max_pos_seq_len = 1024
-model_name=bart
+model_name = bart
+feed_forward_proj = 
 
 [decoder]
 num_heads = 12
diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index c0b36b1f0..6779245a2 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -287,6 +287,7 @@ def convert_checkpoint(args):
     config["encoder"]["num_layers"] = str(hf_config["encoder_layers"])
     config["encoder"]["vocab_size"] = str(hf_config["vocab_size"])
     config["encoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
+    config["encoder"]["feed_forward_proj"] = str(hf_config["activation_function"])
 
     config["decoder"] = {}
     config["decoder"]["num_heads"] = str(hf_config["decoder_attention_heads"])

From a007343d76ceb1b40b36f1a1eb43cde317c64a1c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:29:47 -0700
Subject: [PATCH 096/262] bart example config: set feed_forward_proj = gelu

---
 examples/cpp/bart/config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index 7da37c255..2e8d8c3e9 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -30,7 +30,7 @@ num_layers = 6
 vocab_size = 50265
 max_pos_seq_len = 1024
 model_name = bart
-feed_forward_proj = 
+feed_forward_proj = gelu
 
 [decoder]
 num_heads = 12

From 3b3c162dfa56671a4626f2b69d27f28d82756009 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:31:26 -0700
Subject: [PATCH 097/262] bart triton example: pass ini_name + "/config.ini" to prepareRequest

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 71c8bb083..9e3304982 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -352,7 +352,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record);
+        prepareRequest(ini_name + "/config.ini", node_id, gpu_count, &pointer_record);
     printf("[INFO] request is created \n");
 
     // step 5: Forward

From 3afbd0b882bfe32b634e39c32690300f1dfe4d2c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:32:44 -0700
Subject: [PATCH 098/262] bart example config: set model_name=decoder, drop [llama_7b], add start_id/end_id

---
 examples/cpp/bart/config.ini | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index 2e8d8c3e9..073a59cef 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -5,7 +5,7 @@ enable_custom_all_reduce=0
 tensor_para_size=1
 pipeline_para_size=1
 
-model_name=llama_7b
+model_name=decoder
 model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
 
 [request]
@@ -20,7 +20,6 @@ beam_search_diversity_rate=0.0
 request_batch_size=8 # determine by the request
 request_output_len=32 # determine by the request
 
-[llama_7b]
 [encoder]
 num_heads = 12
 d_kv = 64
@@ -42,3 +41,5 @@ vocab_size = 50265
 max_pos_seq_len = 1024
 decoder_start_token_id = 2
 eos_token_id = 2
+start_id = 1
+end_id = 1

From ae4b890d3839188be9444d97bb9e61e40c1f127d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:33:20 -0700
Subject: [PATCH 099/262] bart triton example: read start_id/end_id from the "decoder" section

---
 examples/cpp/bart/bart_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 9e3304982..f959af890 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -229,8 +229,8 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
 
     const size_t request_batch_size = reader.GetInteger("request", "request_batch_size");
 
-    const int start_id = reader.GetInteger("llama_7b", "start_id");
-    const int end_id   = reader.GetInteger("llama_7b", "end_id");
+    const int start_id = reader.GetInteger("decoder", "start_id");
+    const int end_id   = reader.GetInteger("decoder", "end_id");
 
     std::vector<int> v_start_ids;
     std::vector<int> v_start_lengths;

From 929f8bdb7d416c14d4da99beeeb116b513687f3e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:34:09 -0700
Subject: [PATCH 100/262] bart triton example: load start_ids.csv and bad_words.csv from examples/cpp/bart

---
 examples/cpp/bart/bart_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index f959af890..95a88b73e 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -242,10 +242,10 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
                        max_input_len,
                        end_id,
                        1,
-                       "../examples/cpp/llama/start_ids.csv");
+                       "../examples/cpp/bart/start_ids.csv");
 
     std::vector<int> v_bad_words;
-    ft::read_word_list("../examples/cpp/llama/bad_words.csv", v_bad_words);
+    ft::read_word_list("../examples/cpp/bart/bad_words.csv", v_bad_words);
 
     RequestParam param;
     param.beam_width                 = reader.GetInteger("request", "beam_width");

From 267852556f338deb50f8f0357ba55c38b1f142db Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:35:40 -0700
Subject: [PATCH 101/262] BartTritonModel: re-enable decoding weight creation and loadModel in createSharedWeights

---
 .../triton_backend/bart/BartTritonModel.cc    | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 4aca7695f..275886bbc 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -281,27 +281,27 @@ void BartTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                  position_embedding_type_);
 
     printf("BartDecodingWeight\n");
-    // decoding_shared_weights_[device_id] =
-    //     std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
-    //                                               decoding_size_per_head_,
-    //                                               decoding_d_model_,
-    //                                               decoding_inter_size_,
-    //                                               decoding_vocab_size_,
-    //                                               decoding_num_layer_,
-    //                                               encoder_d_model_,
-    //                                               decoding_max_pos_seq_len_,
-    //                                               tensor_para_size_,
-    //                                               tensor_para_rank,
-    //                                               pipeline_para_size_,
-    //                                               pipeline_para_rank,
-    //                                               bart_with_bias_,
-    //                                               mbart_para_,
-    //                                               use_gated_activation_,
-    //                                               position_embedding_type_);
-
-    // printf("load model\n");
-    // encoder_shared_weights_[device_id]->loadModel(model_dir_);
-    // decoding_shared_weights_[device_id]->loadModel(model_dir_);
+    decoding_shared_weights_[device_id] =
+        std::make_shared<ft::BartDecodingWeight<T>>(decoding_head_num_,
+                                                  decoding_size_per_head_,
+                                                  decoding_d_model_,
+                                                  decoding_inter_size_,
+                                                  decoding_vocab_size_,
+                                                  decoding_num_layer_,
+                                                  encoder_d_model_,
+                                                  decoding_max_pos_seq_len_,
+                                                  tensor_para_size_,
+                                                  tensor_para_rank,
+                                                  pipeline_para_size_,
+                                                  pipeline_para_rank,
+                                                  bart_with_bias_,
+                                                  mbart_para_,
+                                                  use_gated_activation_,
+                                                  position_embedding_type_);
+
+    printf("load model\n");
+    encoder_shared_weights_[device_id]->loadModel(model_dir_);
+    decoding_shared_weights_[device_id]->loadModel(model_dir_);
 }
 
 template<typename T>

From 134bac2297e8fcfacaefa8b405e07b36de0171da Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:36:35 -0700
Subject: [PATCH 102/262] bart converter: write weight_data_type into the encoder section of config.ini

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 6779245a2..1e5172016 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -288,6 +288,7 @@ def convert_checkpoint(args):
     config["encoder"]["vocab_size"] = str(hf_config["vocab_size"])
     config["encoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
     config["encoder"]["feed_forward_proj"] = str(hf_config["activation_function"])
+    config["encoder"]["weight_data_type"] = args.weight_data_type
 
     config["decoder"] = {}
     config["decoder"]["num_heads"] = str(hf_config["decoder_attention_heads"])

From ca817579f73ab8d732d9ca768ebac0f667fd7204 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:36:47 -0700
Subject: [PATCH 103/262] bart example config: add weight_data_type = fp32 to the encoder section

---
 examples/cpp/bart/config.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index 073a59cef..dbf126bb9 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -30,6 +30,7 @@ vocab_size = 50265
 max_pos_seq_len = 1024
 model_name = bart
 feed_forward_proj = gelu
+weight_data_type = fp32
 
 [decoder]
 num_heads = 12

From 58adee186e6fa8ec6886ada40835da1b1607481d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:40:27 -0700
Subject: [PATCH 104/262] bart example config: point model_dir at /notebooks/bart-ft/1/1-gpu/

---
 examples/cpp/bart/config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index dbf126bb9..5e37afaad 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -6,7 +6,7 @@ tensor_para_size=1
 pipeline_para_size=1
 
 model_name=decoder
-model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
+model_dir=/notebooks/bart-ft/1/1-gpu/
 
 [request]
 beam_width=1 # beam width for beam search

From 7d2fdb6921ba55978ba505251a8360493936f15d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:40:52 -0700
Subject: [PATCH 105/262] bart example config: add weight_data_type = fp32 to the decoder section

---
 examples/cpp/bart/config.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/bart/config.ini b/examples/cpp/bart/config.ini
index 5e37afaad..f351289e2 100644
--- a/examples/cpp/bart/config.ini
+++ b/examples/cpp/bart/config.ini
@@ -44,3 +44,4 @@ decoder_start_token_id = 2
 eos_token_id = 2
 start_id = 1
 end_id = 1
+weight_data_type = fp32

From a799fbad0b05a2e1d299e010ebac70d56823b6c2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:41:14 -0700
Subject: [PATCH 106/262] bart converter: write weight_data_type into the decoder section of config.ini

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 1e5172016..66c3bdfe1 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -300,6 +300,7 @@ def convert_checkpoint(args):
     config["decoder"]["max_pos_seq_len"] = str(hf_config["max_position_embeddings"])
     config["decoder"]["decoder_start_token_id"] = str(hf_config["decoder_start_token_id"])
     config["decoder"]["eos_token_id"] = str(hf_config["eos_token_id"])
+    config["decoder"]["weight_data_type"] = args.weight_data_type
 
     with open((saved_dir / "config.ini").as_posix(), 'w') as configfile:
         config.write(configfile)

From a5c16624f7aeb55d6eafd98ea865494b6ace6853 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:43:11 -0700
Subject: [PATCH 107/262] bart triton example: hard-code the model directory passed to createBartModel

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 95a88b73e..d531c7308 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -316,7 +316,7 @@ int main(int argc, char* argv[])
     std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/";
 
     // step 1: Create model
-    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel(ini_name);
+    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel("/notebooks/bart-ft/1/1-gpu");
     int                                       tensor_para_size = model->getTensorParaSize();
     int                                       pipeline_para_size = model->getPipelineParaSize();
     FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),

From 3c1b721396df260a86d17c728d092f4c6ca4ac28 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:43:40 -0700
Subject: [PATCH 108/262] bart triton example: change the hard-coded model directory to /notebooks/bart-ft/1-gpu

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index d531c7308..e0a43f81a 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -316,7 +316,7 @@ int main(int argc, char* argv[])
     std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/tmp/FasterTransformer/examples/cpp/bart/";
 
     // step 1: Create model
-    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel("/notebooks/bart-ft/1/1-gpu");
+    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createBartModel("/notebooks/bart-ft/1-gpu");
     int                                       tensor_para_size = model->getTensorParaSize();
     int                                       pipeline_para_size = model->getPipelineParaSize();
     FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),

From 79600b2ceb69d797ac6d2b3539138528e8ec8795 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:44:33 -0700
Subject: [PATCH 109/262] BartTritonModel: hard-code data_type to "fp32" instead of reading it from the config

---
 src/fastertransformer/triton_backend/bart/BartTritonModel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
index 275886bbc..9d82e4d38 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModel.cc
@@ -30,7 +30,7 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createBartMo
         return nullptr;
     }
 
-    const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type");
+    const std::string data_type = "fp32"; //reader.Get("ft_instance_hyperparameter", "data_type");
     if (data_type == "fp16") {
         // return std::make_shared<BartTritonModel<half>>(reader, model_dir);
         return std::make_shared<BartTritonModel<half>>(1, 1, 0, model_dir, 0);

From 922a295cc3c11edd9d3b73ddc9b430227cffbf31 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:49:50 -0700
Subject: [PATCH 110/262] BartEncoderLayerWeight: load fc1/fc2 weights and biases from per-tp_rank files

---
 .../models/bart/BartEncoderLayerWeight.cc                 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc b/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
index 9714cfa35..7f8f42b3c 100644
--- a/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderLayerWeight.cc
@@ -318,11 +318,11 @@ void BartEncoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
     loadWeightFromBin<T>(weights_ptr_[5],
                         {weights_size_[5]},
-                        dir_path + "layer.SelfAttention.fc1.weight.bin",
+                        dir_path + "layer.SelfAttention.fc1.weight." + tp_rank + ".bin",
                         model_file_type);
     loadWeightFromBin<T>(weights_ptr_[6],
                         {weights_size_[6]},
-                        dir_path + "layer.SelfAttention.fc2.weight.bin",
+                        dir_path + "layer.SelfAttention.fc2.weight." + tp_rank + ".bin",
                         model_file_type);
     loadWeightFromBin<T>(weights_ptr_[7],
                         {weights_size_[7]},
@@ -353,11 +353,11 @@ void BartEncoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
         loadWeightFromBin<T>(weights_ptr_[13],
                             {weights_size_[13]},
-                            dir_path + "layer.SelfAttention.fc1.bias.bin",
+                            dir_path + "layer.SelfAttention.fc1.bias." + tp_rank + ".bin",
                             model_file_type);
         loadWeightFromBin<T>(weights_ptr_[14],
                             {weights_size_[14]},
-                            dir_path + "layer.SelfAttention.fc2.bias.bin",
+                            dir_path + "layer.SelfAttention.fc2.bias." + tp_rank + ".bin",
                             model_file_type);
         loadWeightFromBin<T>(weights_ptr_[15],
                             {weights_size_[15]},

From 4a2fb6a67e00af69a945d593b5d4d21fde5969fe Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:54:23 -0700
Subject: [PATCH 111/262] bart converter: print key and val.shape instead of LOGGER.debug

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 66c3bdfe1..694796076 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -64,10 +64,10 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
 
 
 def split_and_convert_process(key, val, factor, saved_dir):
-    print(key)
     if val.ndim == 2:
         val = val.transpose(1, 0)
-    LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
+    # LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
+    print(f"key: {key}, val.shape: {val.shape}")
 
     if key.find(".embed_positions.weight") != -1:
         if key.find("encoder") != -1:

From e3bef9c8fa9d5961fbd1c7afaf38210258d7642d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:55:26 -0700
Subject: [PATCH 112/262] bart converter: also print the shape of val[:, 2] while debugging

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 694796076..2ddb0284d 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -67,7 +67,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
     if val.ndim == 2:
         val = val.transpose(1, 0)
     # LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
-    print(f"key: {key}, val.shape: {val.shape}")
+    print(f"key: {key}, val.shape: {val.shape} {val[:, 2].shape}")
 
     if key.find(".embed_positions.weight") != -1:
         if key.find("encoder") != -1:

From 9f75a40887e3358f028f9a41a41d5a91b62d2539 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:56:05 -0700
Subject: [PATCH 113/262] bart converter: save embed_positions as val[:, 2:] instead of val[:, 2]

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 2ddb0284d..8620bdef2 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -67,7 +67,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
     if val.ndim == 2:
         val = val.transpose(1, 0)
     # LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
-    print(f"key: {key}, val.shape: {val.shape} {val[:, 2].shape}")
+    print(f"key: {key}, val.shape: {val.shape} {val[:, 2:].shape}")
 
     if key.find(".embed_positions.weight") != -1:
         if key.find("encoder") != -1:
@@ -75,7 +75,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         else:
             prefix = "decoder"
         saved_path = saved_dir / f"{prefix}.embed_positions.weight.bin"
-        val[:, 2].tofile(saved_path.as_posix())
+        val[:, 2:].tofile(saved_path.as_posix())
     elif key.find(".embed_tokens.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"

From e6726ad99fd57c342a6668fce050e6b2f5defbe0 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:56:39 -0700
Subject: [PATCH 114/262] bart converter: drop the val[:, 2:] shape from the debug print

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 8620bdef2..80039d49b 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -67,7 +67,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
     if val.ndim == 2:
         val = val.transpose(1, 0)
     # LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
-    print(f"key: {key}, val.shape: {val.shape} {val[:, 2:].shape}")
+    print(f"key: {key}, val.shape: {val.shape}")
 
     if key.find(".embed_positions.weight") != -1:
         if key.find("encoder") != -1:

From 264e41cf7b78eb135d3977c950cf5d87717010fa Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 22:58:25 -0700
Subject: [PATCH 115/262] BartDecoderLayerWeight: load layer-norm weights from files without the tp_rank suffix

---
 src/fastertransformer/models/bart/BartDecoderLayerWeight.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index c2adb3b06..b01ce5ebe 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -289,7 +289,7 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
     loadWeightFromBin<T>(weights_ptr[0],
                          {weights_size[0]},
-                         dir_path + "layer.SelfAttention.final_layer_norm.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr[1],
                          {weights_size[1]},
@@ -301,7 +301,7 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr[3],
                          {weights_size[3]},
-                         dir_path + "layer.SelfAttention.attn_layer_norm.weight." + tp_rank + ".bin",
+                         dir_path + "layer.SelfAttention.attn_layer_norm.weight.bin",
                          model_file_type);
     loadWeightFromBin<T>(weights_ptr[4],
                          {weights_size[4]},

From 94ecf8df8f99f1aa37272e82ccf76f4e2ec8bac2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 23:06:06 -0700
Subject: [PATCH 116/262] bart converter: convert weights in a sequential loop instead of a multiprocessing pool

---
 .../bart/utils/huggingface_bart_ckpt_convert.py   | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 80039d49b..6157b83ab 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -307,14 +307,15 @@ def convert_checkpoint(args):
     np_weight_data_type = get_weight_data_type(args.weight_data_type)
 
     i_gpu_num = args.inference_tensor_para_size
+    for name, param in bart_model.state_dict().items():
+        split_and_convert_process(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
+    # pool = multiprocessing.Pool(args.processes)
+    # pool.starmap_async(split_and_convert_process,
+    #                    [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
+    #                     for name, param in bart_model.state_dict().items()])
 
-    pool = multiprocessing.Pool(args.processes)
-    pool.starmap_async(split_and_convert_process,
-                       [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
-                        for name, param in bart_model.state_dict().items()])
-
-    pool.close()
-    pool.join()
+    # pool.close()
+    # pool.join()
 
     # fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
 

From 942d1906a0fdbfb9a7f1c85e9a53094c5b28b11e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 23:07:10 -0700
Subject: [PATCH 117/262] bart converter: always use the "decoder" prefix for encoder_attn_layer_norm files

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 6157b83ab..6e4a990c4 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -207,11 +207,11 @@ def split_and_convert_process(key, val, factor, saved_dir):
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn_layer_norm.weight") != -1:
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.attn_layer_norm.weight.bin"
+        saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.attn_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("encoder_attn_layer_norm.bias") != -1:
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
-        saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.attn_layer_norm.bias.bin"
+        saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("fc1.weight") != -1 or key.find("fc2.weight") != -1:
         if key.find("encoder") != -1:

From ccd18aaa2ba47bd783e0235029494799eace49dd Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 23:10:05 -0700
Subject: [PATCH 118/262] bart converter: always use the "decoder" prefix for encoder_attn q/k/v weight and bias files

---
 .../bart/utils/huggingface_bart_ckpt_convert.py      | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 6e4a990c4..d12ddeedc 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -169,14 +169,10 @@ def split_and_convert_process(key, val, factor, saved_dir):
         or key.find("encoder_attn.q_proj.weight") != -1
     ):
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         qkv = key.split('encoder_attn.')[1][:1]
         for j in range(factor):
-            saved_path = saved_dir / f"{prefix}.{layer}.layer.CrossAttention.{qkv}.weight.{j:d}.bin"
+            saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.{qkv}.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif (
         key.find("encoder_attn.k_proj.bias") != -1
@@ -184,14 +180,10 @@ def split_and_convert_process(key, val, factor, saved_dir):
         or key.find("encoder_attn.q_proj.bias") != -1
     ):
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         qkv = key.split('encoder_attn.')[1][:1]
         for j in range(factor):
-            saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{qkv}.bias.{j:d}.bin"
+            saved_path = saved_dir / f"decoder.{layer}.layer.SelfAttention.{qkv}.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn.out_proj.weight") != -1:
         split_vals = np.split(val, factor, axis=0)

From 9820254a966dcfaea40c9677799b38be584f51f9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 23:14:50 -0700
Subject: [PATCH 119/262] BartDecoderLayerWeight: drop ptr assignments, add tp_rank to fc2 path

---
 .../models/bart/BartDecoderLayerWeight.cc           | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index b01ce5ebe..aa114e7fc 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -276,17 +276,6 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
     const auto tp_rank = std::to_string(tensor_para_rank_);
 
-    layernorm_weights.gamma                               = weights_ptr[0];
-    self_attention_weights.query_weight.kernel            = weights_ptr[1];
-    self_attention_weights.attention_output_weight.kernel = weights_ptr[2];
-    self_attn_layernorm_weights.gamma                     = weights_ptr[3];
-
-    cross_attention_weights.query_weight.kernel            = weights_ptr[4];
-    cross_attention_weights.key_weight.kernel              = weights_ptr[5];
-    cross_attention_weights.value_weight.kernel            = weights_ptr[6];
-    cross_attention_weights.attention_output_weight.kernel = weights_ptr[7];
-    cross_attn_layernorm_weights.gamma                     = weights_ptr[8];
-
     loadWeightFromBin<T>(weights_ptr[0],
                          {weights_size[0]},
                          dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
@@ -329,7 +318,7 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
     loadWeightFromBin<T>(weights_ptr[9],
                         {weights_size[9]},
-                        dir_path + "layer.SelfAttention.fc2.weight.bin",
+                        dir_path + "layer.SelfAttention.fc2.weight." + tp_rank + ".bin",
                         model_file_type);
     loadWeightFromBin<T>(weights_ptr[10],
                         {weights_size[10]},

From bf3a61aa784216e4253e17fac63cf719f7373e5c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 23:20:07 -0700
Subject: [PATCH 120/262] BartDecoderLayerWeight: load fc1/fc2 into weights_ptr[9]/[10]

---
 .../models/bart/BartDecoderLayerWeight.cc                  | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index aa114e7fc..b8f389d3e 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -313,16 +313,13 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
                         dir_path + "layer.CrossAttention.attn_layer_norm.weight.bin",
                         model_file_type);
 
-        ffn_weights.intermediate_weight.kernel = weights_ptr[9];
-        ffn_weights.output_weight.kernel       = weights_ptr[10];
-
     loadWeightFromBin<T>(weights_ptr[9],
                         {weights_size[9]},
-                        dir_path + "layer.SelfAttention.fc2.weight." + tp_rank + ".bin",
+                        dir_path + "layer.SelfAttention.fc1.weight." + tp_rank + ".bin",
                         model_file_type);
     loadWeightFromBin<T>(weights_ptr[10],
                         {weights_size[10]},
-                        dir_path + "layer.SelfAttention.final_layer_norm.weight.bin",
+                        dir_path + "layer.SelfAttention.fc2.weight." + tp_rank + ".bin",
                         model_file_type);
 
     if (bart_with_bias_) {

From 3a3e6af7d12203bfaaacd6140fa473dd72ddd689 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Tue, 19 Sep 2023 23:21:08 -0700
Subject: [PATCH 121/262] BartDecoderLayerWeight: add commented-out bias pointer map

---
 .../models/bart/BartDecoderLayerWeight.cc         | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index b8f389d3e..2747d0b7f 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -323,6 +323,21 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
                         model_file_type);
 
     if (bart_with_bias_) {
+        /*
+                    layernorm_weights.beta                              = weights_ptr[11];
+            self_attention_weights.query_weight.bias            = weights_ptr[12];
+            self_attention_weights.attention_output_weight.bias = weights_ptr[13];
+            self_attn_layernorm_weights.beta                    = weights_ptr[14];
+
+            cross_attention_weights.query_weight.bias            = weights_ptr[15];
+            cross_attention_weights.key_weight.bias              = weights_ptr[16];
+            cross_attention_weights.value_weight.bias            = weights_ptr[17];
+            cross_attention_weights.attention_output_weight.bias = weights_ptr[18];
+            cross_attn_layernorm_weights.beta                    = weights_ptr[19];
+
+            ffn_weights.intermediate_weight.bias = weights_ptr[20];
+            ffn_weights.output_weight.bias       = weights_ptr[21];
+        */
         loadWeightFromBin<T>(weights_ptr[8],
                             {weights_size[8]},
                             dir_path + "layer.SelfAttention.q.bias." + tp_rank + ".bin",

From ea27dd82e480f9ca40161d11520aa8bc14c09360 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 09:54:51 -0700
Subject: [PATCH 122/262] BartDecoderLayerWeight: load biases into weights_ptr[11..21]

---
 .../models/bart/BartDecoderLayerWeight.cc     | 54 ++++++++++++-------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
index 2747d0b7f..e77b81b17 100644
--- a/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/bart/BartDecoderLayerWeight.cc
@@ -324,7 +324,7 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
 
     if (bart_with_bias_) {
         /*
-                    layernorm_weights.beta                              = weights_ptr[11];
+            layernorm_weights.beta                              = weights_ptr[11];
             self_attention_weights.query_weight.bias            = weights_ptr[12];
             self_attention_weights.attention_output_weight.bias = weights_ptr[13];
             self_attn_layernorm_weights.beta                    = weights_ptr[14];
@@ -338,39 +338,53 @@ void BartDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType m
             ffn_weights.intermediate_weight.bias = weights_ptr[20];
             ffn_weights.output_weight.bias       = weights_ptr[21];
         */
-        loadWeightFromBin<T>(weights_ptr[8],
-                            {weights_size[8]},
-                            dir_path + "layer.SelfAttention.q.bias." + tp_rank + ".bin",
-                            model_file_type);
-        loadWeightFromBin<T>(weights_ptr[9],
-                            {weights_size[9]},
-                            dir_path + "layer.SelfAttention.k.bias." + tp_rank + ".bin",
-                            model_file_type);
-        loadWeightFromBin<T>(weights_ptr[10],
-                            {weights_size[10]},
-                            dir_path + "layer.SelfAttention.v.bias." + tp_rank + ".bin",
-                            model_file_type);
         loadWeightFromBin<T>(weights_ptr[11],
                             {weights_size[11]},
-                            dir_path + "layer.SelfAttention.out_proj.bias." + tp_rank + ".bin",
+                            dir_path + "layer.SelfAttention.final_layer_norm.bias.bin",
                             model_file_type);
         loadWeightFromBin<T>(weights_ptr[12],
                             {weights_size[12]},
-                            dir_path + "layer.SelfAttention.attn_layer_norm.bias.bin",
+                            dir_path + "layer.SelfAttention.qkv.bias." + tp_rank + ".bin",
                             model_file_type);
-
         loadWeightFromBin<T>(weights_ptr[13],
                             {weights_size[13]},
-                            dir_path + "layer.SelfAttention.fc1.bias.bin",
+                            dir_path + "layer.SelfAttention.out_proj.bias." + tp_rank + ".bin",
                             model_file_type);
         loadWeightFromBin<T>(weights_ptr[14],
                             {weights_size[14]},
-                            dir_path + "layer.SelfAttention.fc2.bias.bin",
+                            dir_path + "layer.SelfAttention.attn_layer_norm.bias.bin",
                             model_file_type);
+
         loadWeightFromBin<T>(weights_ptr[15],
                             {weights_size[15]},
-                            dir_path + "layer.SelfAttention.final_layer_norm.bias.bin",
-                            model_file_type);       
+                            dir_path + "layer.CrossAttention.q.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr[16],
+                            {weights_size[16]},
+                            dir_path + "layer.CrossAttention.k.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr[17],
+                            {weights_size[17]},
+                            dir_path + "layer.CrossAttention.v.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr[18],
+                            {weights_size[18]},
+                            dir_path + "layer.CrossAttention.out_proj.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr[19],
+                            {weights_size[19]},
+                            dir_path + "layer.CrossAttention.attn_layer_norm.bias.bin",
+                            model_file_type);
+
+        loadWeightFromBin<T>(weights_ptr[20],
+                            {weights_size[20]},
+                            dir_path + "layer.SelfAttention.fc1.bias." + tp_rank + ".bin",
+                            model_file_type);
+        loadWeightFromBin<T>(weights_ptr[21],
+                            {weights_size[21]},
+                            dir_path + "layer.SelfAttention.fc2.bias." + tp_rank + ".bin",
+                            model_file_type);
+  
     }
 
     FT_LOG_DEBUG("BartDecoderLayerWeight " + std::string(__func__) + " end");

From a5fc8ef25e8ae6fb9b20166c1c5dc3cc32df94d1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 09:59:33 -0700
Subject: [PATCH 123/262] bart convert: save encoder_attn q/k/v bias as CrossAttention

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index d12ddeedc..81c3eae47 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -183,7 +183,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         layer = int(key.split('layers.')[1].split('.encoder_attn')[0])
         qkv = key.split('encoder_attn.')[1][:1]
         for j in range(factor):
-            saved_path = saved_dir / f"decoder.{layer}.layer.SelfAttention.{qkv}.bias.{j:d}.bin"
+            saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.{qkv}.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("encoder_attn.out_proj.weight") != -1:
         split_vals = np.split(val, factor, axis=0)

From 1db101c82fa703850380a890a9bc753f25890e5c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:05:09 -0700
Subject: [PATCH 124/262] bart convert: adapt fuse_decoder_qkv to BART names, add bias fusion

---
 .../utils/huggingface_bart_ckpt_convert.py    | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 81c3eae47..af167324c 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -45,21 +45,37 @@ def get_weight_data_type(data_type):
 def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
     model_dict = {}
     for name, param in model.named_parameters():
-        if name.find("decoder") != -1 and name.find("SelfAttention") != -1:
+        if name.find("encoder_attn") == -1:
+            continue
+        if name.find(".q_proj.") != -1 or name.find(".k_proj.") != -1 or name.find(".v_proj.") != -1:
             model_dict[name] = param
 
-    for i in range(model.decoder.config.num_layers):
-        shape = model_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"].T.shape
-        qkv = torch.cat([model_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"].T,
-                         model_dict[f"decoder.block.{i}.layer.0.SelfAttention.k.weight"].T,
-                         model_dict[f"decoder.block.{i}.layer.0.SelfAttention.v.weight"].T], dim=-1)
+    for i in range(model.decoder_layers):
+        shape = model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.weight"].T.shape
+        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.weight"].T,
+                         model_dict[f"model.decoder.layers.{i}.encoder_attn.k_proj.weight"].T,
+                         model_dict[f"model.decoder.layers.{i}.encoder_attn.v_proj.weight"].T], dim=-1)
 
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 
         split_vals = np.split(qkv, factor, axis=-1)
         for j in range(factor):
-            saved_path = saved_dir / f"decoder.block.{i}.layer.0.SelfAttention.qkv.weight.{j}.bin"
+            saved_path = saved_dir / f"decoder.{i}.layer.SelfAttention.qkv.weight.{j}.bin"
+            split_vals[j].tofile(saved_path.as_posix())
+
+    for i in range(model.decoder_layers):
+        shape = model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.bias"].shape
+        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.bias"],
+                         model_dict[f"model.decoder.layers.{i}.encoder_attn.k_proj.bias"],
+                         model_dict[f"model.decoder.layers.{i}.encoder_attn.v_proj.bias"]], dim=-1)
+
+        qkv = qkv.reshape([shape[0], 3, shape[1]])
+        qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
+
+        split_vals = np.split(qkv, factor, axis=-1)
+        for j in range(factor):
+            saved_path = saved_dir / f"decoder.{i}.layer.SelfAttention.qkv.bias.{j}.bin"
             split_vals[j].tofile(saved_path.as_posix())
 
 

From 6cc42df9f3a1b2dbd843f7b88d29dd1f61814ee9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:06:23 -0700
Subject: [PATCH 125/262] bart convert: fuse decoder self_attn q/k/v, not encoder_attn

---
 .../utils/huggingface_bart_ckpt_convert.py     | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index af167324c..a3ab1467b 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -45,16 +45,16 @@ def get_weight_data_type(data_type):
 def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
     model_dict = {}
     for name, param in model.named_parameters():
-        if name.find("encoder_attn") == -1:
+        if name.find("self_attn") == -1 or name.find("decoder.layers") == -1:
             continue
         if name.find(".q_proj.") != -1 or name.find(".k_proj.") != -1 or name.find(".v_proj.") != -1:
             model_dict[name] = param
 
     for i in range(model.decoder_layers):
-        shape = model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.weight"].T.shape
-        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.weight"].T,
-                         model_dict[f"model.decoder.layers.{i}.encoder_attn.k_proj.weight"].T,
-                         model_dict[f"model.decoder.layers.{i}.encoder_attn.v_proj.weight"].T], dim=-1)
+        shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T.shape
+        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T,
+                         model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"].T,
+                         model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"].T], dim=-1)
 
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
@@ -65,10 +65,10 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
             split_vals[j].tofile(saved_path.as_posix())
 
     for i in range(model.decoder_layers):
-        shape = model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.bias"].shape
-        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.encoder_attn.q_proj.bias"],
-                         model_dict[f"model.decoder.layers.{i}.encoder_attn.k_proj.bias"],
-                         model_dict[f"model.decoder.layers.{i}.encoder_attn.v_proj.bias"]], dim=-1)
+        shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"].shape
+        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
+                         model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
+                         model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
 
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)

From ef0efb17afac8b5989aa25a135c0d936d0b9a08d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:08:05 -0700
Subject: [PATCH 126/262] bart convert: return early for decoder self_attn q/k/v keys

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index a3ab1467b..b4ce2d521 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -123,6 +123,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
+            return
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         qkv = key.split('self_attn.')[1][:1]
         for j in range(factor):
@@ -138,6 +139,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             prefix = "encoder"
         else:
             prefix = "decoder"
+            return
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         qkv = key.split('self_attn.')[1][:1]
         for j in range(factor):

From 43896aebb09bdc3271873519f0ce36bae367fe29 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:09:12 -0700
Subject: [PATCH 127/262] bart convert: enable fuse_decoder_qkv in convert_checkpoint

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index b4ce2d521..3421f18c8 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -327,7 +327,7 @@ def convert_checkpoint(args):
     # pool.close()
     # pool.join()
 
-    # fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
+    fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
 
 
 if __name__ == "__main__":

From afe6da8710ec9900db664379106692ae7c3ea0e4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:10:02 -0700
Subject: [PATCH 128/262] bart convert: read decoder_layers from model.config

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 3421f18c8..0b874fb88 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -50,7 +50,7 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
         if name.find(".q_proj.") != -1 or name.find(".k_proj.") != -1 or name.find(".v_proj.") != -1:
             model_dict[name] = param
 
-    for i in range(model.decoder_layers):
+    for i in range(model.config.decoder_layers):
         shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T.shape
         qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T,
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"].T,
@@ -64,7 +64,7 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
             saved_path = saved_dir / f"decoder.{i}.layer.SelfAttention.qkv.weight.{j}.bin"
             split_vals[j].tofile(saved_path.as_posix())
 
-    for i in range(model.decoder_layers):
+    for i in range(model.config.decoder_layers):
         shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"].shape
         qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],

From 7df4b83b6a6275e9be6735e34fa1858e399ea658 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:10:59 -0700
Subject: [PATCH 129/262] bart convert: print fused qkv bias shape (debug)

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 0b874fb88..66912a592 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -69,7 +69,7 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
         qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
-
+        print(qkv.shape)
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 

From 50e76ddedcb1947fa692b1ae09e425d4d8590b7c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:13:03 -0700
Subject: [PATCH 130/262] bart convert: reshape fused qkv bias to [shape[0], 3]

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 66912a592..5118db66b 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -70,7 +70,7 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
         print(qkv.shape)
-        qkv = qkv.reshape([shape[0], 3, shape[1]])
+        qkv = qkv.reshape([shape[0], 3])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 
         split_vals = np.split(qkv, factor, axis=-1)

From 7a707139eaa19cd0e0d058ce47619c21002abf29 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:13:16 -0700
Subject: [PATCH 131/262] bart convert: remove qkv bias shape debug print

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 5118db66b..e671b3408 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -69,7 +69,6 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
         qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
-        print(qkv.shape)
         qkv = qkv.reshape([shape[0], 3])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 

From f69c2ff7f38fb55651fc8d8762622591b19dc5f5 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:14:49 -0700
Subject: [PATCH 132/262] bart_triton_example: print gpu_count before forward (debug)

---
 examples/cpp/bart/bart_triton_example.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index e0a43f81a..3f2a8028b 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -358,6 +358,7 @@ int main(int argc, char* argv[])
     // step 5: Forward
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(
         (size_t)gpu_count);
+    printf("[INFO] gpu_count: %d\n", gpu_count);
     for (int i = 0; i < 2; i++) {
         threads.clear();
         for (int device_id = 0; device_id < gpu_count; device_id++) {

From 1aa3334778a2c8cac04c03f1a4ef62e781a795cc Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:15:48 -0700
Subject: [PATCH 133/262] bart_triton_example: print instance/request/output counts (debug)

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 3f2a8028b..4797c1a9a 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -358,7 +358,7 @@ int main(int argc, char* argv[])
     // step 5: Forward
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(
         (size_t)gpu_count);
-    printf("[INFO] gpu_count: %d\n", gpu_count);
+    printf("[INFO] gpu_count: %d %d %d %d\n", gpu_count, model_instances.size(), request_list.size(), output_tensors_lists.size());
     for (int i = 0; i < 2; i++) {
         threads.clear();
         for (int device_id = 0; device_id < gpu_count; device_id++) {

From 92bcfd951e8133a7e5106a69c11409507790809c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:16:47 -0700
Subject: [PATCH 134/262] BartTritonModelInstance: printf on forward entry (debug)

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index efaf99387..ea4970dce 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -101,6 +101,7 @@ template<typename T>
 std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
 BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
 {
+    printf("BartTritonModelInstance<T>::forward\n");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
     const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);

From cc4efd0be7d91954a2ce1a563705a51ca86b8679 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:21:22 -0700
Subject: [PATCH 135/262] BartTritonModelInstance: printf before allocateBuffer (debug)

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index ea4970dce..01f0212ab 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -108,6 +108,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     const size_t beam_width =
         input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
 
+    printf("allocateBuffer\n");
     allocateBuffer(request_batch_size, beam_width, max_output_len, mem_max_seq_len);
 
     ft::TensorMap encoder_input_tensors(convert_inputs(input_tensors));

From 03aa995de06a3b033330ae9d46feaa46de896391 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:22:11 -0700
Subject: [PATCH 136/262] BartTritonModelInstance: print input_ids shape rank (debug)

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 01f0212ab..33ea20c84 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -102,6 +102,7 @@ std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
 BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
 {
     printf("BartTritonModelInstance<T>::forward\n");
+    printf("input_tensors input_ids %d", input_tensors->at("input_ids").shape.size());
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
     const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);

From ae66ba03df039eb8cf2d4179f0c62dc1c5d1a9b8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:25:39 -0700
Subject: [PATCH 137/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 4797c1a9a..ddfa0eaf5 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -243,7 +243,7 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
                        end_id,
                        1,
                        "../examples/cpp/bart/start_ids.csv");
-
+    printf("v_start_ids size: %d", v_start_ids.size());
     std::vector<int> v_bad_words;
     ft::read_word_list("../examples/cpp/bart/bad_words.csv", v_bad_words);
 

From 321667ccfe93251e11a6a64246482dd57f230b9f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:34:14 -0700
Subject: [PATCH 138/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index ddfa0eaf5..dbf71494f 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -73,7 +73,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
     }
     ft::mpi::barrier();
 
-    int request_batch_size = size_2;
+    int request_batch_size = 8;
     int max_input_len      = size_1 / size_2;
 
     ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);

From e0cf38fe40f4540b819cefda40dd284ff5343f1f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:35:11 -0700
Subject: [PATCH 139/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index dbf71494f..637655487 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -243,7 +243,8 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
                        end_id,
                        1,
                        "../examples/cpp/bart/start_ids.csv");
-    printf("v_start_ids size: %d", v_start_ids.size());
+    printf("v_start_ids size: %d v_start_lengths size: %d\n", v_start_ids.size(), v_start_lengths.size());
+
     std::vector<int> v_bad_words;
     ft::read_word_list("../examples/cpp/bart/bad_words.csv", v_bad_words);
 

From 3f5db1e5a903a5ef3ff09042279ded9baadad5e4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:35:42 -0700
Subject: [PATCH 140/262] commit

---
 examples/cpp/bart/start_ids.csv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/bart/start_ids.csv b/examples/cpp/bart/start_ids.csv
index 6b8b9c375..d1ed9fb33 100644
--- a/examples/cpp/bart/start_ids.csv
+++ b/examples/cpp/bart/start_ids.csv
@@ -1,8 +1,8 @@
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973

From 9cf33f52813fa6af7b30e3fe5ef2b4668f54bebd Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:57:33 -0700
Subject: [PATCH 141/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc           | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 33ea20c84..d3b21785f 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -102,6 +102,11 @@ std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
 BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
 {
     printf("BartTritonModelInstance<T>::forward\n");
+    for (const auto& pair : *input_tensors) {
+        int key = pair.first;
+        std::cout << "Key: " << key << std::endl;
+    }
+
     printf("input_tensors input_ids %d", input_tensors->at("input_ids").shape.size());
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];

From 8583d776b686598be9bc74ba80f01054de141eed Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 10:57:58 -0700
Subject: [PATCH 142/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc             | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index d3b21785f..47ccee60d 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -103,8 +103,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
 {
     printf("BartTritonModelInstance<T>::forward\n");
     for (const auto& pair : *input_tensors) {
-        int key = pair.first;
-        std::cout << "Key: " << key << std::endl;
+        std::cout << "Key: " << pair.first << std::endl;
     }
 
     printf("input_tensors input_ids %d", input_tensors->at("input_ids").shape.size());

From 2755984d985b3d1e26fbb1aecd377dcc808996a1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:27:08 -0700
Subject: [PATCH 143/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 47ccee60d..d32639397 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -106,7 +106,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         std::cout << "Key: " << pair.first << std::endl;
     }
 
-    printf("input_tensors input_ids %d", input_tensors->at("input_ids").shape.size());
+    printf("input_tensors input_ids %d", input_tensors->at("input_lengths").shape.size());
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
     const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);

From f3bfdb99dcf31100c2454f78999ff2e4c6881afd Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:27:36 -0700
Subject: [PATCH 144/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc             | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index d32639397..e1339cf3f 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -106,7 +106,8 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         std::cout << "Key: " << pair.first << std::endl;
     }
 
-    printf("input_tensors input_ids %d", input_tensors->at("input_lengths").shape.size());
+    printf("input_tensors input_ids %d", 2);
+    input_tensors->at("input_lengths");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
     const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);

From c0fc6fa6a5dd9ce47bef7726e867eb90554b31ba Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:28:39 -0700
Subject: [PATCH 145/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index e1339cf3f..0fa64e7b3 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -106,7 +106,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         std::cout << "Key: " << pair.first << std::endl;
     }
 
-    printf("input_tensors input_ids %d", 2);
+    printf("input_tensors input_ids");
     input_tensors->at("input_lengths");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];

From 9d9b899540329d575cc3fc0c3585c86caebc9ad1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:29:05 -0700
Subject: [PATCH 146/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 0fa64e7b3..0aa93bed6 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -107,6 +107,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     }
 
     printf("input_tensors input_ids");
+    return;
     input_tensors->at("input_lengths");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];

From e9ab2685f0fef0a9395c03f6cd1dad2e5b07b7bf Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:29:23 -0700
Subject: [PATCH 147/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 0aa93bed6..220c7ce19 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -107,7 +107,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     }
 
     printf("input_tensors input_ids");
-    return;
+    return nullptr;
     input_tensors->at("input_lengths");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];

From c8b3c1f0e1ff275d92ad61c848fd9bea6fbbe93c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:31:00 -0700
Subject: [PATCH 148/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc             | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 220c7ce19..d31ed9418 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -106,8 +106,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         std::cout << "Key: " << pair.first << std::endl;
     }
 
-    printf("input_tensors input_ids");
-    return nullptr;
+    printf("input_tensors input_ids\n");
     input_tensors->at("input_lengths");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];

From af5e2d0b21d7c9e06dc93afe47b55f22a1fd98a3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:31:43 -0700
Subject: [PATCH 149/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index d31ed9418..a0e4c2054 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -104,6 +104,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     printf("BartTritonModelInstance<T>::forward\n");
     for (const auto& pair : *input_tensors) {
         std::cout << "Key: " << pair.first << std::endl;
+        input_tensors->at(pair.first);
     }
 
     printf("input_tensors input_ids\n");

From 1290559865354280f8ca3a14fd983b340c1aa254 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:32:14 -0700
Subject: [PATCH 150/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index a0e4c2054..7482ff1fd 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -109,6 +109,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
 
     printf("input_tensors input_ids\n");
     input_tensors->at("input_lengths");
+    printf("done\n");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];
     const size_t max_output_len     = *((uint*)input_tensors->at("max_output_len").data);

From 6ab81bbc837b84bb6fab400903642249fd2e731f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:32:56 -0700
Subject: [PATCH 151/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 637655487..0f62b9d68 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -130,7 +130,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
                                 triton::TYPE_INT32,
                                 std::vector<size_t>{(size_t)request_batch_size},
                                 d_input_lengths}},
-                {"request_output_len",
+                {"max_output_len",
                  triton::Tensor{triton::MEMORY_CPU,
                                 triton::TYPE_INT32,
                                 std::vector<size_t>{(size_t)request_batch_size},

From f7d61817751ccaeb772a25ceb7be43de51a63c22 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:34:34 -0700
Subject: [PATCH 152/262] commit

---
 examples/cpp/bart/bart_triton_example.cc         |  2 +-
 .../bart/BartTritonModelInstance.cc              | 16 ----------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 0f62b9d68..4bf1f7b17 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -125,7 +125,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
                                 triton::TYPE_INT32,
                                 std::vector<size_t>{(size_t)request_batch_size, (size_t)max_input_len},
                                 d_input_ids}},
-                {"input_lengths",
+                {"sequence_length",
                  triton::Tensor{triton::MEMORY_GPU,
                                 triton::TYPE_INT32,
                                 std::vector<size_t>{(size_t)request_batch_size},
diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 7482ff1fd..4916f0e6a 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -64,22 +64,6 @@ BartTritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<st
         {{"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)},
          {"sequence_length", as_GPU_tensor(input_tensors->at("sequence_length"), d_input_lengths_)}});
 
-    if (input_tensors->count("prompt_learning_task_name_ids")) {
-        ft_input_tensors.insert({"prompt_learning_task_name_ids",
-                                 input_tensors->at("prompt_learning_task_name_ids").convertTritonTensorToFt()});
-    }
-    if (input_tensors->count("request_prompt_lengths")) {
-        move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_);
-        ft_input_tensors.insert(
-            {"request_prompt_lengths",
-             as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)});
-    }
-    if (input_tensors->count("request_prompt_embedding")) {
-        move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_);
-        ft_input_tensors.insert(
-            {"request_prompt_embedding",
-             as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)});
-    }
     return ft_input_tensors;
 }
 

From c24d988f81133e9fdc192403b89a732ccb5fc342 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:35:08 -0700
Subject: [PATCH 153/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 4916f0e6a..848bdcb66 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -104,7 +104,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     allocateBuffer(request_batch_size, beam_width, max_output_len, mem_max_seq_len);
 
     ft::TensorMap encoder_input_tensors(convert_inputs(input_tensors));
-
+    printf("encoder_input_tensors\n");
     ft::TensorMap encoder_output_tensors(
         {{"output_hidden_state",
           ft::Tensor{ft::MEMORY_GPU,

From 724f09fe7d40d6dc6e83234aa0fcab3725540628 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:36:40 -0700
Subject: [PATCH 154/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 848bdcb66..834ce2edd 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -92,7 +92,6 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
     }
 
     printf("input_tensors input_ids\n");
-    input_tensors->at("input_lengths");
     printf("done\n");
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t mem_max_seq_len    = input_tensors->at("input_ids").shape[1];

From 1fb251f26ac4787afe6489263e5bea51889f7125 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:37:53 -0700
Subject: [PATCH 155/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 4bf1f7b17..233f257a8 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -375,6 +375,11 @@ int main(int argc, char* argv[])
     }
     printf("[INFO] forward is completed. \n");
 
+    for (const auto& pair : *output_tensors_lists[0]) {
+        std::cout << "Key: " << pair.first << std::endl;
+        input_tensors->at(pair.first);
+    }
+
     const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data;
     const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];

From 03d91571e15cb8894af50dfbe34a555b75bd45c9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:38:06 -0700
Subject: [PATCH 156/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 233f257a8..afead44d7 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -377,7 +377,6 @@ int main(int argc, char* argv[])
 
     for (const auto& pair : *output_tensors_lists[0]) {
         std::cout << "Key: " << pair.first << std::endl;
-        input_tensors->at(pair.first);
     }
 
     const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data;

From 017b5f5ddb788998c4410c141856e193451e3c15 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:39:28 -0700
Subject: [PATCH 157/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index afead44d7..8991c2e8c 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -383,7 +383,7 @@ int main(int argc, char* argv[])
     const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
-    const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data;
+    const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("sequence_length").data;
     // step 6: check results
     if (node_id == 0) {
 

From 23ce1b892d658af35216eb5f7510b2472be042d8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:41:44 -0700
Subject: [PATCH 158/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 8991c2e8c..16272d10d 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -416,6 +416,9 @@ int main(int argc, char* argv[])
 
                     // if (i < 10)
                         printf("%5d ", hBuf[i]);
+                        if ((i + 1) % (seq_len) == 0) {
+                            printf("\n");
+                        }
                     // if ((i + 1) % (seq_len) == 0 && i < 10)
                     //     std::cout << std::endl;
                 }

From 7c523601c6c29f164dc3a7da11446fa65e9e3344 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:53:41 -0700
Subject: [PATCH 159/262] commit

---
 examples/cpp/bart/start_ids.csv | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/cpp/bart/start_ids.csv b/examples/cpp/bart/start_ids.csv
index d1ed9fb33..b52a6b1ee 100644
--- a/examples/cpp/bart/start_ids.csv
+++ b/examples/cpp/bart/start_ids.csv
@@ -1,8 +1,8 @@
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2

From 2bec74c2aa5436e7997e2f87657c856fb5b6a742 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:54:48 -0700
Subject: [PATCH 160/262] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index 578fbc90b..27bc168a8 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -91,6 +91,9 @@ int read_start_ids(size_t            batch_size,
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }
+    for (const i : v_start_lengths) {
+        printf("%d\n");
+    }
     return batch_size;
 }
 

From e28178d3b4740c58425f574f6380e5cc4cdd18b1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:54:58 -0700
Subject: [PATCH 161/262] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index 27bc168a8..494c19599 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -91,7 +91,7 @@ int read_start_ids(size_t            batch_size,
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }
-    for (const i : v_start_lengths) {
+    for (const auto i : v_start_lengths) {
         printf("%d\n");
     }
     return batch_size;

From ca44022f6dc393b695d192865b1a47cc19f23825 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:56:59 -0700
Subject: [PATCH 162/262] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index 494c19599..4fe776aac 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -91,7 +91,7 @@ int read_start_ids(size_t            batch_size,
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }
-    for (const auto i : v_start_lengths) {
+    for (auto i : v_start_lengths) {
         printf("%d\n");
     }
     return batch_size;

From 79324796d105d8a3b3967a0996bae469bb8e4af7 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:57:05 -0700
Subject: [PATCH 163/262] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index 4fe776aac..463466655 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -91,7 +91,7 @@ int read_start_ids(size_t            batch_size,
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }
-    for (auto i : v_start_lengths) {
+    for (auto i : *v_start_lengths) {
         printf("%d\n");
     }
     return batch_size;

From 1be0d1108a667b8ba3a509da04bd7b0be0dd1fc9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:58:00 -0700
Subject: [PATCH 164/262] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index 463466655..d4c28d8b4 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -91,8 +91,8 @@ int read_start_ids(size_t            batch_size,
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }
-    for (auto i : *v_start_lengths) {
-        printf("%d\n");
+    for (int i : *v_start_lengths) {
+        printf("v_start_lengths %d\n");
     }
     return batch_size;
 }

From 6796bd3fc3f167eb10c880d44a04b46d0b3272a6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 21:59:17 -0700
Subject: [PATCH 165/262] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index d4c28d8b4..3dc2960e8 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -92,7 +92,7 @@ int read_start_ids(size_t            batch_size,
         }
     }
     for (int i : *v_start_lengths) {
-        printf("v_start_lengths %d\n");
+        printf("v_start_lengths %d\n", i);
     }
     return batch_size;
 }

From e838d992614c01bfcc90d762aee56ccbcca47a2a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:06:20 -0700
Subject: [PATCH 166/262] commit

---
 .../bart/BartTritonModelInstance.cc              | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 834ce2edd..a955b8bd4 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -184,6 +184,22 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         }
 
         bart_encoder_->forward(&encoder_output_tensors, &encoder_input_tensors, bart_encoder_weight_.get());
+
+
+{
+        T* buf;
+        int st = request_batch_size * mem_max_seq_len * bart_encoder_->getDModel();
+        buf = new T[st];
+        cudaMemcpy(buf, d_encoder_outputs_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+        printf("cudaMemcpy\n");
+        for (int i=0; i<st; i++) {
+            printf("%f ", double(buf[i]));
+            if (i % 500 == 499 ) {
+                printf("\n");
+            }
+        }
+}
+
         bart_decoding_->forward(&decoding_output_tensors, &decoding_input_tensors, bart_decoding_weight_.get());
 
         if (stream_cb_ != nullptr) {

From f2176d923ae4df8763e2bef794a99a3c47cd2228 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:07:33 -0700
Subject: [PATCH 167/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index a955b8bd4..c150c7488 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -192,9 +192,9 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         buf = new T[st];
         cudaMemcpy(buf, d_encoder_outputs_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("cudaMemcpy\n");
-        for (int i=0; i<st; i++) {
+        for (int i=0; i<10; i++) {
             printf("%f ", double(buf[i]));
-            if (i % 500 == 499 ) {
+            if (i % 500 == 10 ) {
                 printf("\n");
             }
         }

From 660bc49c4d715464c01431475b6ad7ebf812751b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:08:00 -0700
Subject: [PATCH 168/262] commit

---
 .../triton_backend/bart/BartTritonModelInstance.cc               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index c150c7488..1ae9fffb1 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -198,6 +198,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
                 printf("\n");
             }
         }
+        printf("\n");
 }
 
         bart_decoding_->forward(&decoding_output_tensors, &decoding_input_tensors, bart_decoding_weight_.get());

From e2b06f91e93c3d7556c1860713a5f139a41ed56d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:12:23 -0700
Subject: [PATCH 169/262] commit

---
 .vscode/settings.json                            | 3 ++-
 src/fastertransformer/models/bart/BartEncoder.cc | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index efbb5fbf3..655836ef1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -83,6 +83,7 @@
         "charconv": "cpp",
         "ios": "cpp",
         "locale": "cpp",
-        "variant": "cpp"
+        "variant": "cpp",
+        "__memory": "cpp"
     }
 }
diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index bc55b9e45..aea6d352f 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -368,6 +368,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         FT_CHECK(input_tensors->at("input_ids").shape.size() == 2);
     }
     std::string  input_tensor_name  = use_inputs_embeds ? "inputs_embeds" : "input_ids";
+    printf("input_tensor_name: %s\n", input_tensor_name.c_str());
     const size_t request_batch_size = input_tensors->at(input_tensor_name).shape[0];
     const size_t request_seq_len    = input_tensors->at(input_tensor_name).shape[1];
     const bool   return_attentions  = output_tensors->at("output_attentions", {}).size();

From f168f5a8d1797c0c0d90068d0d26af64dd443f11 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:15:49 -0700
Subject: [PATCH 170/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index aea6d352f..5ab9be75d 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -452,6 +452,21 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
             }
         }
 
+{
+        T* buf;
+        int st = max_batch_size_ * max_seq_len_ * d_model_;
+        buf = new T[st];
+        cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+        printf("cudaMemcpy\n");
+        for (int i=0; i<10; i++) {
+            printf("%f ", double(buf[i]));
+            if (i % 500 == 10 ) {
+                printf("\n");
+            }
+        }
+        printf("\n");
+}
+
         sync_check_cuda_error();
 
         size_t  h_token_num;

From b8022c7ccd574703280631cbde837a896eb3260c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:15:57 -0700
Subject: [PATCH 171/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 5ab9be75d..08e8b844d 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -457,7 +457,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         int st = max_batch_size_ * max_seq_len_ * d_model_;
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
-        printf("cudaMemcpy\n");
+        printf("bart_encoder_emb_buf_\n");
         for (int i=0; i<10; i++) {
             printf("%f ", double(buf[i]));
             if (i % 500 == 10 ) {

From 0082ce6736b4a91400ec6b2d34f61e2f5ccddeef Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:16:42 -0700
Subject: [PATCH 172/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 08e8b844d..e3f621da2 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -452,6 +452,8 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
             }
         }
 
+        sync_check_cuda_error();
+
 {
         T* buf;
         int st = max_batch_size_ * max_seq_len_ * d_model_;
@@ -466,9 +468,6 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         }
         printf("\n");
 }
-
-        sync_check_cuda_error();
-
         size_t  h_token_num;
         T*      bart_encoder_input_ptr;
         T*      bart_encoder_output_ptr;

From 89bca5d774c69076bfb8af4b0ba6a35a52e1e118 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:29:37 -0700
Subject: [PATCH 173/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index e3f621da2..8f7b661dd 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -416,6 +416,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         const int* sequence_lengths = input_tensors->at("sequence_length").getPtr<int>() + id_offset;
 
         if (position_embedding_type == PositionEmbeddingType::absolute) {
+            printf("invokeInputIdsEmbeddingLookupPosEncoding\n");
             invokeInputIdsEmbeddingLookupPosEncoding(
                 bart_encoder_emb_buf_,
                 nullptr,

From 30fc8489263ca49dec358ffab614b3eae4f2aec8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:54:36 -0700
Subject: [PATCH 174/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 8f7b661dd..120558e7b 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -458,6 +458,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
 {
         T* buf;
         int st = max_batch_size_ * max_seq_len_ * d_model_;
+        printf("st: %d\n", st);
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("bart_encoder_emb_buf_\n");

From f26673a22142eaede4785f44ea4815a3d950d493 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:55:31 -0700
Subject: [PATCH 175/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 120558e7b..b622382de 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -458,7 +458,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
 {
         T* buf;
         int st = max_batch_size_ * max_seq_len_ * d_model_;
-        printf("st: %d\n", st);
+        printf("st: %d %d %d %d\n",max_batch_size_, max_seq_len_, d_model_, st);
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("bart_encoder_emb_buf_\n");

From fde7e7a8f12d8997d28347d948328f28d0a38373 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 22:58:24 -0700
Subject: [PATCH 176/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index b622382de..bbc674490 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -457,8 +457,10 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
 
 {
         T* buf;
-        int st = max_batch_size_ * max_seq_len_ * d_model_;
-        printf("st: %d %d %d %d\n",max_batch_size_, max_seq_len_, d_model_, st);
+        int batch_size = 8;
+        int seq_len = 11;
+        int st = batch_size * seq_len * d_model_;
+        printf("st: %d %d %d %d\n",batch_size, seq_len, d_model_, st);
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("bart_encoder_emb_buf_\n");

From e6fa60744b613609c70d93ad625a0a88a405cfbf Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 23:03:08 -0700
Subject: [PATCH 177/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 examples/cpp/bart/start_ids.csv          | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 16272d10d..c4e2e4c0e 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -73,7 +73,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
     }
     ft::mpi::barrier();
 
-    int request_batch_size = 8;
+    int request_batch_size = 1;
     int max_input_len      = size_1 / size_2;
 
     ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
diff --git a/examples/cpp/bart/start_ids.csv b/examples/cpp/bart/start_ids.csv
index b52a6b1ee..8a78f4e99 100644
--- a/examples/cpp/bart/start_ids.csv
+++ b/examples/cpp/bart/start_ids.csv
@@ -1,8 +1 @@
 0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
-0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2

From d2c9a9a69462ffecbfeecaad56237d4345e3f157 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 23:03:16 -0700
Subject: [PATCH 178/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index bbc674490..49f73baaf 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -457,7 +457,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
 
 {
         T* buf;
-        int batch_size = 8;
+        int batch_size = 1;
         int seq_len = 11;
         int st = batch_size * seq_len * d_model_;
         printf("st: %d %d %d %d\n",batch_size, seq_len, d_model_, st);

From 2642aae0870fbf6fd47113d860a2fccd3ab03c0c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 23:03:29 -0700
Subject: [PATCH 179/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 49f73baaf..e6aec9556 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -464,7 +464,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("bart_encoder_emb_buf_\n");
-        for (int i=0; i<10; i++) {
+        for (int i=0; i<50; i++) {
             printf("%f ", double(buf[i]));
             if (i % 500 == 10 ) {
                 printf("\n");

From b7a56855fb60b6fcb11c2c0ba167cb07406a604f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 23:04:46 -0700
Subject: [PATCH 180/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index e6aec9556..4854832cf 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -470,6 +470,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
                 printf("\n");
             }
         }
+        printf("buf last: %f\n", buf[st-1]);
         printf("\n");
 }
         size_t  h_token_num;

From f94c0642ae5a3e1970c47343a4abd4c4ee91cc67 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 23:07:58 -0700
Subject: [PATCH 181/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 4854832cf..358460d18 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -466,11 +466,8 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         printf("bart_encoder_emb_buf_\n");
         for (int i=0; i<50; i++) {
             printf("%f ", double(buf[i]));
-            if (i % 500 == 10 ) {
-                printf("\n");
-            }
         }
-        printf("buf last: %f\n", buf[st-1]);
+        printf("buf last: %f\n", double(buf[st-1]));
         printf("\n");
 }
         size_t  h_token_num;

From 75f2e29e5a2b4b2aabffd2c0c3599043621dd4c9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Wed, 20 Sep 2023 23:15:00 -0700
Subject: [PATCH 182/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 358460d18..95b1f939e 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -414,7 +414,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         size_t d_model_offset = id_offset * request_seq_len * d_model_;
 
         const int* sequence_lengths = input_tensors->at("sequence_length").getPtr<int>() + id_offset;
-
+        printf("use_inputs_embeds: %d\n", use_inputs_embeds);
         if (position_embedding_type == PositionEmbeddingType::absolute) {
             printf("invokeInputIdsEmbeddingLookupPosEncoding\n");
             invokeInputIdsEmbeddingLookupPosEncoding(

From e9343067da25765177797146fc959436640f2482 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 19:57:10 -0700
Subject: [PATCH 183/262] commit

---
 .../models/bart/BartEncoderWeight.cc              | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index a28485722..f948920f1 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -256,6 +256,21 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
     FT_CHECK(is_maintain_buffer == true);
 
     loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/encoder.embed_positions.weight.bin", model_file_type);
+{
+        T* buf;
+        int batch_size = 1;
+        int seq_len = 11;
+        int st = weights_size[0];
+        printf("weights_size: %d \n",weights_size[0]);
+        buf = new T[st];
+        cudaMemcpy(buf, weights_ptr[0], sizeof(T) * st, cudaMemcpyDeviceToHost);
+        printf("weights_ptr[0]\n");
+        for (int i=0; i<50; i++) {
+            printf("%f ", double(buf[i]));
+        }
+        printf("buf last: %f\n", double(buf[st-1]));
+        printf("\n");
+}
     loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/encoder.embed_tokens.weight.bin", model_file_type);
     loadWeightFromBin<T>(
         weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/encoder.final_layer_norm.weight.bin", model_file_type);

From 6006eae16e45a809c02d1e5f42c46e603cc8e94e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 19:58:12 -0700
Subject: [PATCH 184/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index c4e2e4c0e..8ef83c8e2 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -360,7 +360,7 @@ int main(int argc, char* argv[])
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(
         (size_t)gpu_count);
     printf("[INFO] gpu_count: %d %d %d %d\n", gpu_count, model_instances.size(), request_list.size(), output_tensors_lists.size());
-    for (int i = 0; i < 2; i++) {
+    for (int i = 0; i < 1; i++) {
         threads.clear();
         for (int device_id = 0; device_id < gpu_count; device_id++) {
             threads.push_back(std::thread(threadForward,

From bca15e7e10ab929ac92f0a40ea81a95f4ea65a68 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:01:23 -0700
Subject: [PATCH 185/262] commit

---
 src/fastertransformer/models/bart/BartEncoderWeight.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoderWeight.cc b/src/fastertransformer/models/bart/BartEncoderWeight.cc
index f948920f1..275f21435 100644
--- a/src/fastertransformer/models/bart/BartEncoderWeight.cc
+++ b/src/fastertransformer/models/bart/BartEncoderWeight.cc
@@ -256,14 +256,15 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
     FT_CHECK(is_maintain_buffer == true);
 
     loadWeightFromBin<T>(weights_ptr[0], {(size_t)weights_size[0]}, dir_path + "/encoder.embed_positions.weight.bin", model_file_type);
+    loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/encoder.embed_tokens.weight.bin", model_file_type);
 {
         T* buf;
         int batch_size = 1;
         int seq_len = 11;
-        int st = weights_size[0];
-        printf("weights_size: %d \n",weights_size[0]);
+        int st = weights_size[1];
+        printf("weights_size: %d \n",weights_size[1]);
         buf = new T[st];
-        cudaMemcpy(buf, weights_ptr[0], sizeof(T) * st, cudaMemcpyDeviceToHost);
+        cudaMemcpy(buf, weights_ptr[1], sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("weights_ptr[0]\n");
         for (int i=0; i<50; i++) {
             printf("%f ", double(buf[i]));
@@ -271,7 +272,6 @@ void BartEncoderWeight<T>::loadModel(std::string dir_path)
         printf("buf last: %f\n", double(buf[st-1]));
         printf("\n");
 }
-    loadWeightFromBin<T>(weights_ptr[1], {(size_t)weights_size[1]}, dir_path + "/encoder.embed_tokens.weight.bin", model_file_type);
     loadWeightFromBin<T>(
         weights_ptr[2], {(size_t)weights_size[2]}, dir_path + "/encoder.final_layer_norm.weight.bin", model_file_type);
     if (bart_with_bias) {

From 041cd8ec03a987309be0ed7b18d4cbd6bed932a1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:22:23 -0700
Subject: [PATCH 186/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 95b1f939e..33a31ab95 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -464,6 +464,13 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("bart_encoder_emb_buf_\n");
+        for (int b=0; b<batch_size; b++) {
+            int tt = seq_len * d_model_;
+            for (int i=0; i<tt; i++) {
+                printf("%f ", buf[tt*batch_size+i]);
+            }
+            printf("\n");
+        }
         for (int i=0; i<50; i++) {
             printf("%f ", double(buf[i]));
         }

From 52124fdeb61c5e30173591821519c9bba7d67e3f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:22:53 -0700
Subject: [PATCH 187/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 33a31ab95..26ab7f7ad 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -467,7 +467,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         for (int b=0; b<batch_size; b++) {
             int tt = seq_len * d_model_;
             for (int i=0; i<tt; i++) {
-                printf("%f ", buf[tt*batch_size+i]);
+                printf("%f ", double(buf[tt*batch_size+i]));
             }
             printf("\n");
         }

From dce397f1de58076d34316611aadef93e5bfe80f1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:23:08 -0700
Subject: [PATCH 188/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 26ab7f7ad..b8633f70c 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -467,7 +467,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         for (int b=0; b<batch_size; b++) {
             int tt = seq_len * d_model_;
             for (int i=0; i<tt; i++) {
-                printf("%f ", double(buf[tt*batch_size+i]));
+                printf("%f ", double(buf[tt*batch_size+b]));
             }
             printf("\n");
         }

From e5836a604e3054e19f17a66c62c8a581de72ce2e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:23:30 -0700
Subject: [PATCH 189/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index b8633f70c..681766bb5 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -467,7 +467,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         for (int b=0; b<batch_size; b++) {
             int tt = seq_len * d_model_;
             for (int i=0; i<tt; i++) {
-                printf("%f ", double(buf[tt*batch_size+b]));
+                printf("%f ", double(buf[i*batch_size+b]));
             }
             printf("\n");
         }

From 63a1f61d720608c378f6aebac9110ec9f80d6ad6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:23:56 -0700
Subject: [PATCH 190/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 681766bb5..9deed8563 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -468,6 +468,9 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
             int tt = seq_len * d_model_;
             for (int i=0; i<tt; i++) {
                 printf("%f ", double(buf[i*batch_size+b]));
+                if (tt == 0) {
+                    break;
+                }
             }
             printf("\n");
         }

From 46fb97d05ca2e70f8e7c19ae500cc15969905eec Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:24:27 -0700
Subject: [PATCH 191/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 9deed8563..94d6d230c 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -468,7 +468,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
             int tt = seq_len * d_model_;
             for (int i=0; i<tt; i++) {
                 printf("%f ", double(buf[i*batch_size+b]));
-                if (tt == 0) {
+                if (i > 10) {
                     break;
                 }
             }

From 84ca292401a3eb895184e9669caf615d3163992b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:27:10 -0700
Subject: [PATCH 192/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index 94d6d230c..da244e6a2 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -464,10 +464,9 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         buf = new T[st];
         cudaMemcpy(buf, bart_encoder_emb_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("bart_encoder_emb_buf_\n");
-        for (int b=0; b<batch_size; b++) {
-            int tt = seq_len * d_model_;
-            for (int i=0; i<tt; i++) {
-                printf("%f ", double(buf[i*batch_size+b]));
+        for (int i=0; i < seq_len; i++) {
+            for (int j=0; j<d_model_; j++) {
+                printf("%f ", double(buf[i+j*seq_len]));
                 if (i > 10) {
                     break;
                 }

From f3e0e34dbd179e890044585293976436aca6c31c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:27:16 -0700
Subject: [PATCH 193/262] commit

---
 src/fastertransformer/models/bart/BartEncoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartEncoder.cc b/src/fastertransformer/models/bart/BartEncoder.cc
index da244e6a2..0f6487b6a 100644
--- a/src/fastertransformer/models/bart/BartEncoder.cc
+++ b/src/fastertransformer/models/bart/BartEncoder.cc
@@ -467,7 +467,7 @@ void BartEncoder<T>::forward(TensorMap*                  output_tensors,
         for (int i=0; i < seq_len; i++) {
             for (int j=0; j<d_model_; j++) {
                 printf("%f ", double(buf[i+j*seq_len]));
-                if (i > 10) {
+                if (j > 10) {
                     break;
                 }
             }

From ada09770d6e8b1f0440acae3266db0eda059da7e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 20:57:07 -0700
Subject: [PATCH 194/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index e671b3408..9b76c9169 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -90,14 +90,14 @@ def split_and_convert_process(key, val, factor, saved_dir):
         else:
             prefix = "decoder"
         saved_path = saved_dir / f"{prefix}.embed_positions.weight.bin"
-        val[:, 2:].tofile(saved_path.as_posix())
+        val[:, 2:].T.tofile(saved_path.as_posix())
     elif key.find(".embed_tokens.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"
         else:
             prefix = "decoder"
         saved_path = saved_dir / f"{prefix}.embed_tokens.weight.bin"
-        val.tofile(saved_path.as_posix())
+        val.T.tofile(saved_path.as_posix())
     elif key.find(".layernorm_embedding.weight") != -1:
         if key.find("encoder") != -1:
             prefix = "encoder"

From 9d3947add1765670b2b2444567f78ef7477424ab Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:25:04 -0700
Subject: [PATCH 195/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 6286f7b26..064279899 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -734,6 +734,18 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                      {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}},
                      {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}});
 
+{
+                    T* buf;
+                    int st = batch_size * beam_width * vocab_size_padded_;
+                    buf = new T[st];
+                    cudaMemcpy(buf, logits_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+                    printf("logits_buf_\n");
+                    for (int i=0; i<50; i++) {
+                        printf("%f ", double(buf[i]));
+                    }
+                    printf("buf last: %f\n", double(buf[st-1]));
+                    printf("\n");
+            }
                 if (cache_indirections_[src_indir_idx] != nullptr) {
                     dynamic_decode_input_tensors.insert(
                         "src_cache_indirection",

From edc41c224da638f1dc76acf1d8b4cfcbda213c30 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:40:29 -0700
Subject: [PATCH 196/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 064279899..5da1f75de 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -745,7 +745,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                     }
                     printf("buf last: %f\n", double(buf[st-1]));
                     printf("\n");
-            }
+}
                 if (cache_indirections_[src_indir_idx] != nullptr) {
                     dynamic_decode_input_tensors.insert(
                         "src_cache_indirection",

From cec4eca695e8c648c7561888655bd2ddfd172850 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:54:43 -0700
Subject: [PATCH 197/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py   |  1 -
 src/fastertransformer/models/bart/BartDecoder.cc  | 15 ++++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 9b76c9169..6dd3c48c0 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -14,7 +14,6 @@
 
 import argparse
 import configparser
-import multiprocessing
 from datetime import datetime
 import logging
 from pathlib import Path
diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 2c7180549..c117df048 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -546,7 +546,20 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                                                        stream_);
         }
         sync_check_cuda_error();
-
+{
+    {
+                    T* buf;
+                    int st = local_batch_size * d_model_;
+                    buf = new T[st];
+                    cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
+                    printf("decoder_output\n");
+                    for (int i=0; i<50; i++) {
+                        printf("%f ", double(buf[i]));
+                    }
+                    printf("buf last: %f\n", double(buf[st-1]));
+                    printf("\n");
+}
+}
         if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1
             && pipeline_para_.world_size_ > 1) {
             // ftNcclSend(decoder_output, local_batch_size * d_model_, pipeline_para_.rank_ + 1,

From 8e8e4afdea780ef8d7253e91df12d9de44ca088a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:55:15 -0700
Subject: [PATCH 198/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index c117df048..46f3df3ad 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    printf("decoder_output\n");
+                    printf("decoder_output at layer %d\n", l);
                     for (int i=0; i<50; i++) {
                         printf("%f ", double(buf[i]));
                     }

From 235ef50d4a036167f67fb900bd8f260e35f70eef Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:56:43 -0700
Subject: [PATCH 199/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 46f3df3ad..84a6c5f00 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,12 +552,15 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    printf("decoder_output at layer %d\n", l);
-                    for (int i=0; i<50; i++) {
-                        printf("%f ", double(buf[i]));
+                    if (input_tensors->at(4) == 0) {
+
+                        printf("decoder_output at layer %d step %d\n", l, input_tensors->at(4));
+                        for (int i=0; i<50; i++) {
+                            printf("%f ", double(buf[i]));
+                        }
+                        printf("buf last: %f\n", double(buf[st-1]));
+                        printf("\n");
                     }
-                    printf("buf last: %f\n", double(buf[st-1]));
-                    printf("\n");
 }
 }
         if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1

From 31f6f101d83bd49262a0d378e4ce3899e26a9d73 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:57:22 -0700
Subject: [PATCH 200/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 84a6c5f00..3848e67dd 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,9 +552,9 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    if (input_tensors->at(4) == 0) {
+                    if (input_tensors->at(4).data[0] == 0) {
 
-                        printf("decoder_output at layer %d step %d\n", l, input_tensors->at(4));
+                        printf("decoder_output at layer %d step %d\n", l, input_tensors->at(4).data[0]);
                         for (int i=0; i<50; i++) {
                             printf("%f ", double(buf[i]));
                         }

From 097f8c3017a84670fae460a444cdabdb8984828d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:58:13 -0700
Subject: [PATCH 201/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 3848e67dd..d61aa5df8 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,9 +552,10 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    if (input_tensors->at(4).data[0] == 0) {
+                    int * step = input_tensors->at(4).data;
+                    if ( *step == 0) {
 
-                        printf("decoder_output at layer %d step %d\n", l, input_tensors->at(4).data[0]);
+                        printf("decoder_output at layer %d step %d\n", l, *step);
                         for (int i=0; i<50; i++) {
                             printf("%f ", double(buf[i]));
                         }

From 094d637441127a310c2f78a5ffb13765106feeea Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:58:17 -0700
Subject: [PATCH 202/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index d61aa5df8..7a74443b8 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -554,7 +554,6 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
                     int * step = input_tensors->at(4).data;
                     if ( *step == 0) {
-
                         printf("decoder_output at layer %d step %d\n", l, *step);
                         for (int i=0; i<50; i++) {
                             printf("%f ", double(buf[i]));

From 0d11a2f41b1bc9dfeeb3ed52d52ffbc0711f83e9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:58:33 -0700
Subject: [PATCH 203/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 7a74443b8..80da80d76 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    int * step = input_tensors->at(4).data;
+                    const int * step = input_tensors->at(4).data;
                     if ( *step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, *step);
                         for (int i=0; i<50; i++) {

From 6c0275a98647f51df817e5f064c62b3c232d65f3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 21:59:25 -0700
Subject: [PATCH 204/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 80da80d76..68ffb0f46 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,9 +552,9 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    const int * step = input_tensors->at(4).data;
-                    if ( *step == 0) {
-                        printf("decoder_output at layer %d step %d\n", l, *step);
+                    const int step = input_tensors->at(4).getVal(0);
+                    if (step == 0) {
+                        printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {
                             printf("%f ", double(buf[i]));
                         }

From f30144281fc7d99321b16478526b765466fae9e7 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:00:59 -0700
Subject: [PATCH 205/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 68ffb0f46..23b787865 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    const int step = input_tensors->at(4).getVal(0);
+                    const int step = input_tensors->at(4).getVal((size_t)0);
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From c5786f773fb6a13c7b3464bf04df08d5025484a2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:01:15 -0700
Subject: [PATCH 206/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 23b787865..7e3248017 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    const int step = input_tensors->at(4).getVal((size_t)0);
+                    const int step = input_tensors->at(4).getVal(size_t(0));
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From ab8d79670d7627bb9744b324f52d7e94146b01cd Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:01:34 -0700
Subject: [PATCH 207/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 7e3248017..9a8a19d03 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    const int step = input_tensors->at(4).getVal(size_t(0));
+                    const T step = input_tensors->at(4).getVal(size_t(0));
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From be3b5a1eb0110f4712f1a6e28c3f147bf74f7f3b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:02:43 -0700
Subject: [PATCH 208/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 9a8a19d03..2213d9c39 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    const T step = input_tensors->at(4).getVal(size_t(0));
+                    const T step = input_tensors->at(4).getVal();
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From 6841c8b83530d9082e006c7df22d7e0a4e4c151d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:03:00 -0700
Subject: [PATCH 209/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 2213d9c39..617bafd6c 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    const T step = input_tensors->at(4).getVal();
+                    T step = input_tensors->at(4).getVal();
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From 66333b115500c4b8c1cd68c12033d998e7546d14 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:03:35 -0700
Subject: [PATCH 210/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 617bafd6c..440b65517 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    T step = input_tensors->at(4).getVal();
+                    int step = input_tensors->at(4).getVal();
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From 573bdbc9462e4e85aa741fe6dc87c4fb1e40431d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:04:51 -0700
Subject: [PATCH 211/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 440b65517..5b7434e72 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -552,7 +552,8 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     int st = local_batch_size * d_model_;
                     buf = new T[st];
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    int step = input_tensors->at(4).getVal();
+                    auto step_ptr = input_tensors->at(4).data;
+                    int step = ((int*)step_ptr)[0];
                     if (step == 0) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {

From c9baac99b4040d0cf5768573924e109dc86c0a1e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:06:21 -0700
Subject: [PATCH 212/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 5b7434e72..029857942 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -554,7 +554,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
                     auto step_ptr = input_tensors->at(4).data;
                     int step = ((int*)step_ptr)[0];
-                    if (step == 0) {
+                    if (true) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {
                             printf("%f ", double(buf[i]));

From 285e55abd283be81d1efd88029d5eb02439f5137 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:06:53 -0700
Subject: [PATCH 213/262] commit

---
 src/fastertransformer/models/bart/BartDecoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 029857942..988f52339 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -554,7 +554,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
                     auto step_ptr = input_tensors->at(4).data;
                     int step = ((int*)step_ptr)[0];
-                    if (true) {
+                    if (step == 1) {
                         printf("decoder_output at layer %d step %d\n", l, step);
                         for (int i=0; i<50; i++) {
                             printf("%f ", double(buf[i]));

From da0d7fa68a13406ddd4e655c7a7b78a514ef9546 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:17:45 -0700
Subject: [PATCH 214/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 3 ++-
 src/fastertransformer/models/bart/BartDecoder.cc             | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 6dd3c48c0..d921b3dc0 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -14,6 +14,7 @@
 
 import argparse
 import configparser
+import multiprocessing
 from datetime import datetime
 import logging
 from pathlib import Path
@@ -68,7 +69,7 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
         qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
-        qkv = qkv.reshape([shape[0], 3])
+        # qkv = qkv.reshape([shape[0], 3])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 
         split_vals = np.split(qkv, factor, axis=-1)
diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index 988f52339..e940ef159 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -562,7 +562,7 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                         printf("buf last: %f\n", double(buf[st-1]));
                         printf("\n");
                     }
-}
+    }
 }
         if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1
             && pipeline_para_.world_size_ > 1) {

From 04e26426b3211435e1ac5d699bff3bebaa792d48 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:24:53 -0700
Subject: [PATCH 215/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 5da1f75de..56ee8447a 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -432,6 +432,18 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                              max_input_length - 1,
                              stream_);
     sync_check_cuda_error();
+    {
+        int* buf;
+        int st = batch_size;
+        buf = new T[st];
+        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("logits_buf_\n");
+        for (int i=0; i<50; i++) {
+            printf("%f ", double(buf[i]));
+        }
+        printf("buf last: %f\n", double(buf[st-1]));
+        printf("\n");
+    }
 
     invokeBuildRelativeAttentionBias(relative_attention_bias_,
                                      decoding_weights->absolute_or_relative_position_embedding,

From 4698de70989cafa81b45254807ce0bcbf4b39883 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:25:00 -0700
Subject: [PATCH 216/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 56ee8447a..eb03d572d 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -437,7 +437,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         int st = batch_size;
         buf = new T[st];
         cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("logits_buf_\n");
+        printf("output_ids_buf_\n");
         for (int i=0; i<50; i++) {
             printf("%f ", double(buf[i]));
         }

From 94c31da0e487fec71f20039a6d2c81b23ff9b46b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:26:23 -0700
Subject: [PATCH 217/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index eb03d572d..ec31d7030 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -435,7 +435,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
     {
         int* buf;
         int st = batch_size;
-        buf = new T[st];
+        buf = new int[st];
         cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids_buf_\n");
         for (int i=0; i<50; i++) {

From 1699f149daf102800cdccebed7a8b2ca0420261d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:27:48 -0700
Subject: [PATCH 218/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index ec31d7030..54a4a8b72 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -436,8 +436,8 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         int* buf;
         int st = batch_size;
         buf = new int[st];
-        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("output_ids_buf_\n");
+        cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("start_ids_buf_\n");
         for (int i=0; i<50; i++) {
             printf("%f ", double(buf[i]));
         }

From d98c2511d6b861c0161c488311eb677941cc24d0 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:30:15 -0700
Subject: [PATCH 219/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 54a4a8b72..f33a25cb3 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -382,6 +382,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         dynamic_decode_layer_->setup(batch_size, beam_width, &input_map);
         handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size);
         handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size);
+        printf("start_id_ end_id_ %d %d\n", start_id_, end_id_);
     }
 
     FT_CHECK_WITH_INFO(input_tensors->at("encoder_output").shape[2] == d_model_,

From 0bca9aa64804a5f1e1a0da06fcd0095780156c10 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:31:16 -0700
Subject: [PATCH 220/262] BartDecoding: print start_ids_buf_ as ints up to start_id_, drop last-element print

---
 src/fastertransformer/models/bart/BartDecoding.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index f33a25cb3..d66b550bb 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -439,10 +439,9 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         buf = new int[st];
         cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("start_ids_buf_\n");
-        for (int i=0; i<50; i++) {
-            printf("%f ", double(buf[i]));
+        for (int i=0; i<start_id_; i++) {
+            printf("%d ", buf[i]);
         }
-        printf("buf last: %f\n", double(buf[st-1]));
         printf("\n");
     }
 

From ab5327c43a084643f7aa20a0dc781dcdd99aee36 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Thu, 21 Sep 2023 22:31:59 -0700
Subject: [PATCH 221/262] BartDecoding: bound start_ids_buf_ debug loop by st instead of start_id_

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index d66b550bb..f1dce97cf 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -439,7 +439,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         buf = new int[st];
         cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("start_ids_buf_\n");
-        for (int i=0; i<start_id_; i++) {
+        for (int i=0; i<st; i++) {
             printf("%d ", buf[i]);
         }
         printf("\n");

From cbd4958e2c5932b3a1511ff6bc5c0b99a57bc06d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 14:58:45 -0700
Subject: [PATCH 222/262] BartDecoding: include batch_size in start_ids_buf_ debug header

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index f1dce97cf..fd2096b17 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -438,7 +438,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         int st = batch_size;
         buf = new int[st];
         cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("start_ids_buf_\n");
+        printf("start_ids_buf_ batch_size: %d\n", batch_size);
         for (int i=0; i<st; i++) {
             printf("%d ", buf[i]);
         }

From 23e97d41698c2b33b80028c3e6746714615467c3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:00:29 -0700
Subject: [PATCH 223/262] bart_triton_example: comment out start_id/end_id request tensors

---
 examples/cpp/bart/bart_triton_example.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 8ef83c8e2..435edce16 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -138,10 +138,11 @@ broadCastRequest(const std::vector<int>& v_start_ids,
                 {"bad_words_list",
                  triton::Tensor{
                      triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}},
-                {"start_id",
-                 triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}},
-                {"end_id",
-                 triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}));
+                // {"start_id",
+                //  triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}},
+                // {"end_id",
+                //  triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}
+                 }));
 
         int* beam_width_ptr = new int(param.beam_width);
         pointer_record->push_back(beam_width_ptr);

From 8444e4513254b971627d229161b8bd543dbe40ec Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:02:37 -0700
Subject: [PATCH 224/262] BartDecoding: disable early start_ids/logits dumps, dump start_ids_buf_ around dynamic decode forward

---
 .../models/bart/BartDecoding.cc               | 68 ++++++++++++-------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index fd2096b17..ade3e4a24 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -433,17 +433,17 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                              max_input_length - 1,
                              stream_);
     sync_check_cuda_error();
-    {
-        int* buf;
-        int st = batch_size;
-        buf = new int[st];
-        cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("start_ids_buf_ batch_size: %d\n", batch_size);
-        for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
-        }
-        printf("\n");
-    }
+    // {
+    //     int* buf;
+    //     int st = batch_size;
+    //     buf = new int[st];
+    //     cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+    //     printf("start_ids_buf_ batch_size: %d\n", batch_size);
+    //     for (int i=0; i<st; i++) {
+    //         printf("%d ", buf[i]);
+    //     }
+    //     printf("\n");
+    // }
 
     invokeBuildRelativeAttentionBias(relative_attention_bias_,
                                      decoding_weights->absolute_or_relative_position_embedding,
@@ -746,18 +746,18 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                      {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}},
                      {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}});
 
-{
-                    T* buf;
-                    int st = batch_size * beam_width * vocab_size_padded_;
-                    buf = new T[st];
-                    cudaMemcpy(buf, logits_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    printf("logits_buf_\n");
-                    for (int i=0; i<50; i++) {
-                        printf("%f ", double(buf[i]));
-                    }
-                    printf("buf last: %f\n", double(buf[st-1]));
-                    printf("\n");
-}
+// {
+//                     T* buf;
+//                     int st = batch_size * beam_width * vocab_size_padded_;
+//                     buf = new T[st];
+//                     cudaMemcpy(buf, logits_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+//                     printf("logits_buf_\n");
+//                     for (int i=0; i<50; i++) {
+//                         printf("%f ", double(buf[i]));
+//                     }
+//                     printf("buf last: %f\n", double(buf[st-1]));
+//                     printf("\n");
+// }
                 if (cache_indirections_[src_indir_idx] != nullptr) {
                     dynamic_decode_input_tensors.insert(
                         "src_cache_indirection",
@@ -815,8 +815,30 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                     }
                     dynamic_decode_output_tensors.insert(*t);
                 }
+    {
+        int* buf;
+        int st = batch_size;
+        buf = new int[st];
+        cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("start_ids_buf_ before forward: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
 
                 dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
+    {
+        int* buf;
+        int st = batch_size;
+        buf = new int[st];
+        cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("start_ids_buf_ after forward: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
             }
         }
 

From e331161d35c058f9e221c5cbef45033f0ceb2bc9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:04:06 -0700
Subject: [PATCH 225/262] BartDecoding: re-enable early debug dump, now reading output_ids_buf_

---
 .../models/bart/BartDecoding.cc               | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index ade3e4a24..67105a9bf 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -433,17 +433,17 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                              max_input_length - 1,
                              stream_);
     sync_check_cuda_error();
-    // {
-    //     int* buf;
-    //     int st = batch_size;
-    //     buf = new int[st];
-    //     cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-    //     printf("start_ids_buf_ batch_size: %d\n", batch_size);
-    //     for (int i=0; i<st; i++) {
-    //         printf("%d ", buf[i]);
-    //     }
-    //     printf("\n");
-    // }
+    {
+        int* buf;
+        int st = batch_size;
+        buf = new int[st];
+        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("output_ids_buf_ batch_size: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
 
     invokeBuildRelativeAttentionBias(relative_attention_bias_,
                                      decoding_weights->absolute_or_relative_position_embedding,

From c86a3845574628c6a09647a44f91bb6ce03199eb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:04:37 -0700
Subject: [PATCH 226/262] BartDecoding: comment out start_ids_buf_ dumps around dynamic decode forward

---
 .../models/bart/BartDecoding.cc               | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 67105a9bf..ce66d1187 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -815,30 +815,30 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                     }
                     dynamic_decode_output_tensors.insert(*t);
                 }
-    {
-        int* buf;
-        int st = batch_size;
-        buf = new int[st];
-        cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("start_ids_buf_ before forward: %d\n", batch_size);
-        for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
-        }
-        printf("\n");
-    }
+    // {
+    //     int* buf;
+    //     int st = batch_size;
+    //     buf = new int[st];
+    //     cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+    //     printf("start_ids_buf_ before forward: %d\n", batch_size);
+    //     for (int i=0; i<st; i++) {
+    //         printf("%d ", buf[i]);
+    //     }
+    //     printf("\n");
+    // }
 
                 dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
-    {
-        int* buf;
-        int st = batch_size;
-        buf = new int[st];
-        cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("start_ids_buf_ after forward: %d\n", batch_size);
-        for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
-        }
-        printf("\n");
-    }
+    // {
+    //     int* buf;
+    //     int st = batch_size;
+    //     buf = new int[st];
+    //     cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+    //     printf("start_ids_buf_ after forward: %d\n", batch_size);
+    //     for (int i=0; i<st; i++) {
+    //         printf("%d ", buf[i]);
+    //     }
+    //     printf("\n");
+    // }
             }
         }
 

From 29690574011511e0d904e8be6c823afc99fc66f3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:11:01 -0700
Subject: [PATCH 227/262] bart_triton_example: comment out d_input_lengths, iBuf print and timed forward loop

---
 examples/cpp/bart/bart_triton_example.cc | 38 ++++++++++++------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 435edce16..22b1af2ec 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -384,7 +384,7 @@ int main(int argc, char* argv[])
     const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
-    const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("sequence_length").data;
+    // const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("sequence_length").data;
     // step 6: check results
     if (node_id == 0) {
 
@@ -404,10 +404,10 @@ int main(int argc, char* argv[])
             {
                 std::cout << "Writing " << outCount << " elements\n";
                 int zeroCount = 0;
-                for (int i=0; i<batch_size; i++) {
-                    printf("%d ", iBuf[i]);
-                }
-                printf("\n");
+                // for (int i=0; i<batch_size; i++) {
+                //     printf("%d ", iBuf[i]);
+                // }
+                // printf("\n");
                 for (size_t i = 0; i < outCount; i++) {
                     if (hBuf[i] == int(0))
                         zeroCount++;
@@ -435,20 +435,20 @@ int main(int argc, char* argv[])
     cudaDeviceSynchronize();
     gettimeofday(&start, NULL);
 
-    const int ite = 1;
-    for (int i = 0; i < ite; i++) {
-        threads.clear();
-        for (int device_id = 0; device_id < gpu_count; device_id++) {
-            threads.push_back(std::thread(threadForward,
-                                          &model_instances[device_id],
-                                          request_list[device_id],
-                                          &output_tensors_lists[device_id],
-                                          device_id));
-        }
-        for (auto& t : threads) {
-            t.join();
-        }
-    }
+    // const int ite = 1;
+    // for (int i = 0; i < ite; i++) {
+    //     threads.clear();
+    //     for (int device_id = 0; device_id < gpu_count; device_id++) {
+    //         threads.push_back(std::thread(threadForward,
+    //                                       &model_instances[device_id],
+    //                                       request_list[device_id],
+    //                                       &output_tensors_lists[device_id],
+    //                                       device_id));
+    //     }
+    //     for (auto& t : threads) {
+    //         t.join();
+    //     }
+    // }
 
     cudaDeviceSynchronize();
     ft::mpi::barrier();

From 7c26c1a0cfadc1836bc47f779df53291e31dc33a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:11:19 -0700
Subject: [PATCH 228/262] bart_triton_example: comment out iBuf allocation and d_input_lengths copy

---
 examples/cpp/bart/bart_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 22b1af2ec..0ad13c02b 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -396,9 +396,9 @@ int main(int argc, char* argv[])
         else {
             size_t outCount = batch_size * beam_width * seq_len;
             int*   hBuf     = new int[outCount];
-            int*   iBuf     = new int[batch_size];
+            // int*   iBuf     = new int[batch_size];
             ft::cudaD2Hcpy(hBuf, d_output_ids, outCount);
-            ft::cudaD2Hcpy(iBuf, d_input_lengths, batch_size);
+            // ft::cudaD2Hcpy(iBuf, d_input_lengths, batch_size);
             
 
             {

From b222d500de453cf43d962335e11ae731a642f126 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:11:40 -0700
Subject: [PATCH 229/262] bart_triton_example: comment out final sync, MPI barrier and timing printf

---
 examples/cpp/bart/bart_triton_example.cc | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index 0ad13c02b..e02534dde 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -450,17 +450,17 @@ int main(int argc, char* argv[])
     //     }
     // }
 
-    cudaDeviceSynchronize();
-    ft::mpi::barrier();
+    // cudaDeviceSynchronize();
+    // ft::mpi::barrier();
 
-    gettimeofday(&end, NULL);
+    // gettimeofday(&end, NULL);
 
-    printf("[INFO] batch_size %d beam_width %d seq_len %d"
-           " FT-CPP-GPT-Triton-time %.2f ms\n",
-           batch_size,
-           beam_width,
-           seq_len,
-           ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
+    // printf("[INFO] batch_size %d beam_width %d seq_len %d"
+    //        " FT-CPP-GPT-Triton-time %.2f ms\n",
+    //        batch_size,
+    //        beam_width,
+    //        seq_len,
+    //        ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
 
     ft::mpi::finalize();
     return 0;

From cb5d9365fc699296b010d7331c06cc9f64c60100 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:15:11 -0700
Subject: [PATCH 230/262] Bart: disable decoder_output dump; dump full output_ids_buf_ around dynamic decode forward

---
 .../models/bart/BartDecoder.cc                | 36 +++++++--------
 .../models/bart/BartDecoding.cc               | 46 +++++++++----------
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoder.cc b/src/fastertransformer/models/bart/BartDecoder.cc
index e940ef159..d8c928e86 100644
--- a/src/fastertransformer/models/bart/BartDecoder.cc
+++ b/src/fastertransformer/models/bart/BartDecoder.cc
@@ -546,24 +546,24 @@ void BartDecoder<T>::forward(std::vector<Tensor>*                           outp
                                                        stream_);
         }
         sync_check_cuda_error();
-{
-    {
-                    T* buf;
-                    int st = local_batch_size * d_model_;
-                    buf = new T[st];
-                    cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
-                    auto step_ptr = input_tensors->at(4).data;
-                    int step = ((int*)step_ptr)[0];
-                    if (step == 1) {
-                        printf("decoder_output at layer %d step %d\n", l, step);
-                        for (int i=0; i<50; i++) {
-                            printf("%f ", double(buf[i]));
-                        }
-                        printf("buf last: %f\n", double(buf[st-1]));
-                        printf("\n");
-                    }
-    }
-}
+// {
+//     {
+//                     T* buf;
+//                     int st = local_batch_size * d_model_;
+//                     buf = new T[st];
+//                     cudaMemcpy(buf, decoder_output, sizeof(T) * st, cudaMemcpyDeviceToHost);
+//                     auto step_ptr = input_tensors->at(4).data;
+//                     int step = ((int*)step_ptr)[0];
+//                     if (step == 1) {
+//                         printf("decoder_output at layer %d step %d\n", l, step);
+//                         for (int i=0; i<50; i++) {
+//                             printf("%f ", double(buf[i]));
+//                         }
+//                         printf("buf last: %f\n", double(buf[st-1]));
+//                         printf("\n");
+//                     }
+//     }
+// }
         if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1
             && pipeline_para_.world_size_ > 1) {
             // ftNcclSend(decoder_output, local_batch_size * d_model_, pipeline_para_.rank_ + 1,
diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index ce66d1187..1461b7cfe 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -435,7 +435,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
     sync_check_cuda_error();
     {
         int* buf;
-        int st = batch_size;
+        int st = batch_size * (max_seq_len+1);
         buf = new int[st];
         cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids_buf_ batch_size: %d\n", batch_size);
@@ -815,30 +815,30 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                     }
                     dynamic_decode_output_tensors.insert(*t);
                 }
-    // {
-    //     int* buf;
-    //     int st = batch_size;
-    //     buf = new int[st];
-    //     cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-    //     printf("start_ids_buf_ before forward: %d\n", batch_size);
-    //     for (int i=0; i<st; i++) {
-    //         printf("%d ", buf[i]);
-    //     }
-    //     printf("\n");
-    // }
+    {
+        int* buf;
+        int st = batch_size * (max_seq_len+1);
+        buf = new int[st];
+        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("start_ids_buf_ before forward: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
 
                 dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
-    // {
-    //     int* buf;
-    //     int st = batch_size;
-    //     buf = new int[st];
-    //     cudaMemcpy(buf, start_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-    //     printf("start_ids_buf_ after forward: %d\n", batch_size);
-    //     for (int i=0; i<st; i++) {
-    //         printf("%d ", buf[i]);
-    //     }
-    //     printf("\n");
-    // }
+    {
+        int* buf;
+        int st = batch_size * (max_seq_len+1);
+        buf = new int[st];
+        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("start_ids_buf_ after forward: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
             }
         }
 

From 52e3e3e59b64450d55ecd5d345b630a2a8a6ac01 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:24:36 -0700
Subject: [PATCH 231/262] BartTritonModelInstance: add debug dump of d_output_ids_ after decoding forward

---
 .../triton_backend/bart/BartTritonModelInstance.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 1ae9fffb1..0c2593f29 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -203,6 +203,20 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
 
         bart_decoding_->forward(&decoding_output_tensors, &decoding_input_tensors, bart_decoding_weight_.get());
 
+{
+        int* buf;
+        int st = request_batch_size * 32;
+        buf = new int[st];
+        cudaMemcpy(buf, d_output_ids_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+        printf("cudaMemcpy d_output_ids_\n");
+        for (int i=0; i<10; i++) {
+            printf("%d ", (buf[i]));
+            if (i % 500 == 10 ) {
+                printf("\n");
+            }
+        }
+        printf("\n");
+}
         if (stream_cb_ != nullptr) {
             bart_decoding_->unRegisterCallback();
         }

From 1a6ae8eb697d0f219682191500df4216af34dd00 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:28:38 -0700
Subject: [PATCH 232/262] BartTritonModelInstance: use sizeof(int) for d_output_ids_ debug copy

---
 .../triton_backend/bart/BartTritonModelInstance.cc              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 0c2593f29..3d57f8920 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -207,7 +207,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
         int* buf;
         int st = request_batch_size * 32;
         buf = new int[st];
-        cudaMemcpy(buf, d_output_ids_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+        cudaMemcpy(buf, d_output_ids_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("cudaMemcpy d_output_ids_\n");
         for (int i=0; i<10; i++) {
             printf("%d ", (buf[i]));

From 03cd96db0e5996696ab1429a9ec57c4ad2211a23 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 15:29:22 -0700
Subject: [PATCH 233/262] BartTritonModelInstance: size d_output_ids_ dump by max_output_len instead of 32

---
 .../triton_backend/bart/BartTritonModelInstance.cc              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
index 3d57f8920..2c8add9b6 100644
--- a/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/bart/BartTritonModelInstance.cc
@@ -205,7 +205,7 @@ BartTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::stri
 
 {
         int* buf;
-        int st = request_batch_size * 32;
+        int st = request_batch_size * max_output_len;
         buf = new int[st];
         cudaMemcpy(buf, d_output_ids_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("cudaMemcpy d_output_ids_\n");

From a5aac95e0bd3189c2abc09d93cb2af6f48c24a83 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:11:35 -0700
Subject: [PATCH 234/262] BartDecoding: dump decoder_input_buf_ when step == max_input_length

---
 src/fastertransformer/models/bart/BartDecoding.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 1461b7cfe..71a84dcf5 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -524,6 +524,17 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                 sync_check_cuda_error();
             }
 
+    if (step == max_input_length) {
+        T* buf;
+        int st = batch_size * d_model_;
+        buf = new T[st];
+        cudaMemcpy(buf, decoder_input_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+        printf("decoder_input_buf_: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
             // BART/mBART has a layernorm after word + positional embedding
             invokeGeneralT5LayerNorm(decoder_input_buf_ + d_model_offset,
                                      decoder_input_buf_ + d_model_offset,

From 721e2b1611961137c5ef9e918ac3e5012347589e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:13:28 -0700
Subject: [PATCH 235/262] BartDecoding: comment out output_ids_buf_ dumps around dynamic decode forward

---
 .../models/bart/BartDecoding.cc               | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 71a84dcf5..68f9f3a91 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -826,30 +826,30 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
                     }
                     dynamic_decode_output_tensors.insert(*t);
                 }
-    {
-        int* buf;
-        int st = batch_size * (max_seq_len+1);
-        buf = new int[st];
-        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("start_ids_buf_ before forward: %d\n", batch_size);
-        for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
-        }
-        printf("\n");
-    }
+    // {
+    //     int* buf;
+    //     int st = batch_size * (max_seq_len+1);
+    //     buf = new int[st];
+    //     cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+    //     printf("start_ids_buf_ before forward: %d\n", batch_size);
+    //     for (int i=0; i<st; i++) {
+    //         printf("%d ", buf[i]);
+    //     }
+    //     printf("\n");
+    // }
 
                 dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
-    {
-        int* buf;
-        int st = batch_size * (max_seq_len+1);
-        buf = new int[st];
-        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("start_ids_buf_ after forward: %d\n", batch_size);
-        for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
-        }
-        printf("\n");
-    }
+    // {
+    //     int* buf;
+    //     int st = batch_size * (max_seq_len+1);
+    //     buf = new int[st];
+    //     cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+    //     printf("start_ids_buf_ after forward: %d\n", batch_size);
+    //     for (int i=0; i<st; i++) {
+    //         printf("%d ", buf[i]);
+    //     }
+    //     printf("\n");
+    // }
             }
         }
 

From ef0994f507a8487aeefcaf2531aeb5128a1467b8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:13:58 -0700
Subject: [PATCH 236/262] BartDecoding: print decoder_input_buf_ values as double (%f) instead of %d

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 68f9f3a91..8b435c3bd 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -531,7 +531,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         cudaMemcpy(buf, decoder_input_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
         printf("decoder_input_buf_: %d\n", batch_size);
         for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
+            printf("%f ", double(buf[i]));
         }
         printf("\n");
     }

From 177935c2f7d107783119b89f4b8fd19244d73220 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:27:57 -0700
Subject: [PATCH 237/262] commit

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index d921b3dc0..27877420a 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -268,7 +268,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         val.tofile(saved_path.as_posix())
     elif key.find("lm_head.weight") != -1:
         saved_path = saved_dir / "decoder.lm_head.weight.bin"
-        val.tofile(saved_path.as_posix())
+        val.T.tofile(saved_path.as_posix())
     elif key.find("final_logits_bias") != -1:
         saved_path = saved_dir / "decoder.final_logits_bias.bin"
         val.tofile(saved_path.as_posix())

From e4cfa3c8df250237f19232b56d5b5b23df857f46 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:29:46 -0700
Subject: [PATCH 238/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 27877420a..3bd186722 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -316,15 +316,15 @@ def convert_checkpoint(args):
     np_weight_data_type = get_weight_data_type(args.weight_data_type)
 
     i_gpu_num = args.inference_tensor_para_size
-    for name, param in bart_model.state_dict().items():
-        split_and_convert_process(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
-    # pool = multiprocessing.Pool(args.processes)
-    # pool.starmap_async(split_and_convert_process,
-    #                    [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
-    #                     for name, param in bart_model.state_dict().items()])
+    # for name, param in bart_model.state_dict().items():
+    #     split_and_convert_process(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
+    pool = multiprocessing.Pool(args.processes)
+    pool.starmap_async(split_and_convert_process,
+                       [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
+                        for name, param in bart_model.state_dict().items()])
 
-    # pool.close()
-    # pool.join()
+    pool.close()
+    pool.join()
 
     fuse_decoder_qkv(bart_model, i_gpu_num, saved_dir, np_weight_data_type)
 

From e14226e6845232d6c4675e8a284d62695af7273c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:34:18 -0700
Subject: [PATCH 239/262] commit

---
 .../models/bart/BartDecoding.cc               | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 8b435c3bd..6366106e7 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -839,17 +839,17 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
     // }
 
                 dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
-    // {
-    //     int* buf;
-    //     int st = batch_size * (max_seq_len+1);
-    //     buf = new int[st];
-    //     cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-    //     printf("start_ids_buf_ after forward: %d\n", batch_size);
-    //     for (int i=0; i<st; i++) {
-    //         printf("%d ", buf[i]);
-    //     }
-    //     printf("\n");
-    // }
+    {
+        int* buf;
+        int st = batch_size * (max_seq_len+1);
+        buf = new int[st];
+        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("output_ids_buf_ after forward: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
             }
         }
 

From 5087376b2709ad724cd80ae83c2fa9edcf86810c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:38:54 -0700
Subject: [PATCH 240/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 6366106e7..ddc0c2b05 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -978,6 +978,18 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         }
     }
 
+    {
+        int* buf;
+        int st = batch_size * (max_seq_len+1);
+        buf = new int[st];
+        cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("output_ids_buf_ after finalize: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
+
     if (pipeline_para_.world_size_ > 1) {
         ftNcclGroupStart();
         if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) {

From b3491bfe2890f16b610967eab6ea7e261e2902c3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:45:15 -0700
Subject: [PATCH 241/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index ddc0c2b05..7a74e68a0 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -988,6 +988,19 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
             printf("%d ", buf[i]);
         }
         printf("\n");
+        
+    }
+
+    {
+        int* buf;
+        int st = batch_size * (max_seq_len+1);
+        buf = new int[st];
+        cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("output_ids after finalize: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
     }
 
     if (pipeline_para_.world_size_ > 1) {

From d23cefea7504ace2a0e20e959371a7bbf25fc2da Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:47:07 -0700
Subject: [PATCH 242/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 7a74e68a0..de3c48e67 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -994,8 +994,8 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
     {
         int* buf;
         int st = batch_size * (max_seq_len+1);
-        buf = new int[st];
-        cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
+        buf = output_tensors->at("output_ids").getPtr<int>();
+        //cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids after finalize: %d\n", batch_size);
         for (int i=0; i<st; i++) {
             printf("%d ", buf[i]);

From a95aff4332483e8acacf0b6915edf80e9fc5b7b9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:49:19 -0700
Subject: [PATCH 243/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index de3c48e67..7a74e68a0 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -994,8 +994,8 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
     {
         int* buf;
         int st = batch_size * (max_seq_len+1);
-        buf = output_tensors->at("output_ids").getPtr<int>();
-        //cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
+        buf = new int[st];
+        cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids after finalize: %d\n", batch_size);
         for (int i=0; i<st; i++) {
             printf("%d ", buf[i]);

From 53c4b28420575623a2bacce3f2b3fb672fbe785c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:52:59 -0700
Subject: [PATCH 244/262] commit

---
 .../models/bart/BartDecoding.cc               | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 7a74e68a0..9774655dc 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -991,18 +991,6 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         
     }
 
-    {
-        int* buf;
-        int st = batch_size * (max_seq_len+1);
-        buf = new int[st];
-        cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("output_ids after finalize: %d\n", batch_size);
-        for (int i=0; i<st; i++) {
-            printf("%d ", buf[i]);
-        }
-        printf("\n");
-    }
-
     if (pipeline_para_.world_size_ > 1) {
         ftNcclGroupStart();
         if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) {
@@ -1069,6 +1057,17 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
     // throw errors when detected
     ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_);
 
+    {
+        int* buf;
+        int st = batch_size * (max_seq_len+1);
+        buf = new int[st];
+        cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
+        printf("output_ids after finalize: %d\n", batch_size);
+        for (int i=0; i<st; i++) {
+            printf("%d ", buf[i]);
+        }
+        printf("\n");
+    }
     if (is_free_buffer_after_forward_) {
         freeBuffer();
     }

From 82d1388e076369014961cba245254015dd440f67 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 16:57:08 -0700
Subject: [PATCH 245/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 9774655dc..7770171a8 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -1062,7 +1062,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         int st = batch_size * (max_seq_len+1);
         buf = new int[st];
         cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
-        printf("output_ids after finalize: %d\n", batch_size);
+        printf("output_ids after finalize: %s %d\n", output_tensors->at("output_ids").toString().c_str(), batch_size);
         for (int i=0; i<st; i++) {
             printf("%d ", buf[i]);
         }

From d056e3bec2740a02c8bf31a387fea1569c2f5a4b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 17:00:26 -0700
Subject: [PATCH 246/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 7770171a8..819069544 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -1061,7 +1061,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         int* buf;
         int st = batch_size * (max_seq_len+1);
         buf = new int[st];
-        cudaMemcpy(buf, output_tensors->at("output_ids").getPtr<int>(), sizeof(int) * st, cudaMemcpyDeviceToHost);
+        cudaMemcpy(buf, output_tensors->at("output_ids").data, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids after finalize: %s %d\n", output_tensors->at("output_ids").toString().c_str(), batch_size);
         for (int i=0; i<st; i++) {
             printf("%d ", buf[i]);

From 17af137678a2a30f9166cb0aa7b41525c7fb00e8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 17:02:38 -0700
Subject: [PATCH 247/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 819069544..70c39a022 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -980,7 +980,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
 
     {
         int* buf;
-        int st = batch_size * (max_seq_len+1);
+        int st = 32;
         buf = new int[st];
         cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids_buf_ after finalize: %d\n", batch_size);

From 910169d53aff3fc6334c937af1888763a30499ae Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 17:03:29 -0700
Subject: [PATCH 248/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 70c39a022..08457c14f 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -980,7 +980,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
 
     {
         int* buf;
-        int st = 32;
+        int st = batch_size * (max_seq_len+1);
         buf = new int[st];
         cudaMemcpy(buf, output_ids_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids_buf_ after finalize: %d\n", batch_size);
@@ -1059,7 +1059,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
 
     {
         int* buf;
-        int st = batch_size * (max_seq_len+1);
+        int st = 32;
         buf = new int[st];
         cudaMemcpy(buf, output_tensors->at("output_ids").data, sizeof(int) * st, cudaMemcpyDeviceToHost);
         printf("output_ids after finalize: %s %d\n", output_tensors->at("output_ids").toString().c_str(), batch_size);

From f06b30f72f875094010201916ee755389ac85feb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 17:04:21 -0700
Subject: [PATCH 249/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 08457c14f..1f1b8b358 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -944,7 +944,8 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         else {
             // For sampling, only transpose the results to output_tensor
             invokeTransposeAxis01(output_tensors->at("output_ids").getPtr<int>(),
-                                  output_ids_buf_ + batch_size * beam_width,
+                                  // output_ids_buf_ + batch_size * beam_width,
+                                  output_ids_buf_,
                                   max_seq_len,
                                   batch_size * beam_width,
                                   1,

From 5996f5c8a46388f1d77b734b81d7362caba7b52e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 17:05:09 -0700
Subject: [PATCH 250/262] commit

---
 src/fastertransformer/models/bart/BartDecoding.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.cc b/src/fastertransformer/models/bart/BartDecoding.cc
index 1f1b8b358..08457c14f 100644
--- a/src/fastertransformer/models/bart/BartDecoding.cc
+++ b/src/fastertransformer/models/bart/BartDecoding.cc
@@ -944,8 +944,7 @@ void BartDecoding<T>::forward(TensorMap*                   output_tensors,
         else {
             // For sampling, only transpose the results to output_tensor
             invokeTransposeAxis01(output_tensors->at("output_ids").getPtr<int>(),
-                                  // output_ids_buf_ + batch_size * beam_width,
-                                  output_ids_buf_,
+                                  output_ids_buf_ + batch_size * beam_width,
                                   max_seq_len,
                                   batch_size * beam_width,
                                   1,

From 4a4a6733f95c03b95316d5aab1193fbb3e136306 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Fri, 22 Sep 2023 17:14:55 -0700
Subject: [PATCH 251/262] commit

---
 examples/cpp/bart/bart_triton_example.cc | 2 +-
 examples/cpp/bart/start_ids.csv          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/bart/bart_triton_example.cc b/examples/cpp/bart/bart_triton_example.cc
index e02534dde..448167273 100644
--- a/examples/cpp/bart/bart_triton_example.cc
+++ b/examples/cpp/bart/bart_triton_example.cc
@@ -73,7 +73,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
     }
     ft::mpi::barrier();
 
-    int request_batch_size = 1;
+    int request_batch_size = 2;
     int max_input_len      = size_1 / size_2;
 
     ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD);
diff --git a/examples/cpp/bart/start_ids.csv b/examples/cpp/bart/start_ids.csv
index 8a78f4e99..0fb3afb4a 100644
--- a/examples/cpp/bart/start_ids.csv
+++ b/examples/cpp/bart/start_ids.csv
@@ -1 +1,2 @@
 0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2
+0, 4154, 1231, 15674, 345, 1534, 440, 50264, 11, 1854, 2

From ef2b4aaa83e32cb928949cf75dfac97b745fe1d2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:12:48 -0700
Subject: [PATCH 252/262] commit

---
 .../pytorch/bart/utils/huggingface_bart_ckpt_convert.py     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 3bd186722..c2357a665 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -69,7 +69,6 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
         qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
                          model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
-        # qkv = qkv.reshape([shape[0], 3])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 
         split_vals = np.split(qkv, factor, axis=-1)
@@ -78,11 +77,12 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
             split_vals[j].tofile(saved_path.as_posix())
 
 
+def get_encoder_or_decoder(key):
+    return "encoder" if key.find("encoder") != -1 else "decoder"
+
 def split_and_convert_process(key, val, factor, saved_dir):
     if val.ndim == 2:
         val = val.transpose(1, 0)
-    # LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
-    print(f"key: {key}, val.shape: {val.shape}")
 
     if key.find(".embed_positions.weight") != -1:
         if key.find("encoder") != -1:

From 81f8437e971b7fd93619441fe937f9c8555d4e5d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:19:18 -0700
Subject: [PATCH 253/262] commit

---
 .../utils/huggingface_bart_ckpt_convert.py    | 74 +++++--------------
 1 file changed, 18 insertions(+), 56 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index c2357a665..c637cea28 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -85,31 +85,19 @@ def split_and_convert_process(key, val, factor, saved_dir):
         val = val.transpose(1, 0)
 
     if key.find(".embed_positions.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         saved_path = saved_dir / f"{prefix}.embed_positions.weight.bin"
         val[:, 2:].T.tofile(saved_path.as_posix())
     elif key.find(".embed_tokens.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         saved_path = saved_dir / f"{prefix}.embed_tokens.weight.bin"
         val.T.tofile(saved_path.as_posix())
     elif key.find(".layernorm_embedding.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         saved_path = saved_dir / f"{prefix}.final_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find(".layernorm_embedding.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         saved_path = saved_dir / f"{prefix}.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
     elif (
@@ -118,10 +106,9 @@ def split_and_convert_process(key, val, factor, saved_dir):
         or key.find("self_attn.q_proj.weight") != -1
     ):
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
+        if prefix == "decoder":
+            # will be handled in fuse_decoder_qkv instead
             return
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         qkv = key.split('self_attn.')[1][:1]
@@ -134,10 +121,9 @@ def split_and_convert_process(key, val, factor, saved_dir):
         or key.find("self_attn.q_proj.bias") != -1
     ):
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
+        if prefix == "decoder":
+            # will be handled in fuse_decoder_qkv instead
             return
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         qkv = key.split('self_attn.')[1][:1]
@@ -146,37 +132,25 @@ def split_and_convert_process(key, val, factor, saved_dir):
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("self_attn.out_proj.weight") != -1:
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("self_attn.out_proj.bias") != -1:
         split_vals = np.split(val, factor, axis=0)
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.out_proj.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("self_attn_layer_norm.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("self_attn_layer_norm.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         layer = int(key.split('layers.')[1].split('.self_attn')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
@@ -223,10 +197,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
         saved_path = saved_dir / f"decoder.{layer}.layer.CrossAttention.attn_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("fc1.weight") != -1 or key.find("fc2.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         split_vals = np.split(val, factor, axis=0)
         if key.find("fc1.") != -1:
             fc = 'fc1'
@@ -237,10 +208,7 @@ def split_and_convert_process(key, val, factor, saved_dir):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("fc1.bias") != -1 or key.find("fc2.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         if key.find("fc1.") != -1:
             fc = 'fc1'
         else:
@@ -251,18 +219,12 @@ def split_and_convert_process(key, val, factor, saved_dir):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.bias.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("final_layer_norm.weight") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         layer = int(key.split('layers.')[1].split('.final_layer_norm.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.final_layer_norm.weight.bin"
         val.tofile(saved_path.as_posix())
     elif key.find("final_layer_norm.bias") != -1:
-        if key.find("encoder") != -1:
-            prefix = "encoder"
-        else:
-            prefix = "decoder"
+        prefix = get_encoder_or_decoder(key)
         layer = int(key.split('layers.')[1].split('.final_layer_norm.')[0])
         saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.final_layer_norm.bias.bin"
         val.tofile(saved_path.as_posix())

From 07809134527c73064bfb8e65afb382eee9085edc Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:24:48 -0700
Subject: [PATCH 254/262] commit

---
 .../bart/utils/huggingface_bart_ckpt_convert.py   | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index c637cea28..f03db7b6d 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -80,6 +80,11 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
 def get_encoder_or_decoder(key):
     return "encoder" if key.find("encoder") != -1 else "decoder"
 
+
+def get_fc(key):
+    return "fc1" if key.find("fc1.") != -1 else "fc2"
+
+
 def split_and_convert_process(key, val, factor, saved_dir):
     if val.ndim == 2:
         val = val.transpose(1, 0)
@@ -199,20 +204,14 @@ def split_and_convert_process(key, val, factor, saved_dir):
     elif key.find("fc1.weight") != -1 or key.find("fc2.weight") != -1:
         prefix = get_encoder_or_decoder(key)
         split_vals = np.split(val, factor, axis=0)
-        if key.find("fc1.") != -1:
-            fc = 'fc1'
-        else:
-            fc = 'fc2'
+        fc = get_fc(key)
         layer = int(key.split('layers.')[1].split(f'.{fc}.')[0])
         for j in range(factor):
             saved_path = saved_dir / f"{prefix}.{layer}.layer.SelfAttention.{fc}.weight.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
     elif key.find("fc1.bias") != -1 or key.find("fc2.bias") != -1:
         prefix = get_encoder_or_decoder(key)
-        if key.find("fc1.") != -1:
-            fc = 'fc1'
-        else:
-            fc = 'fc2'
+        fc = get_fc(key)
         layer = int(key.split('layers.')[1].split(f'.{fc}.')[0])
         split_vals = np.split(val, factor, axis=0)
         for j in range(factor):

From ac96919f04118ef08729828bce6195a55df09098 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:25:50 -0700
Subject: [PATCH 255/262] commit

---
 .../pytorch/bart/utils/huggingface_bart_ckpt_convert.py     | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index f03db7b6d..9cdf5f375 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -19,12 +19,6 @@
 import logging
 from pathlib import Path
 
-import sys
-import os
-
-dir_path = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(dir_path + "/../../../../3rdparty/transformers/src/")
-
 from transformers import BartForConditionalGeneration
 
 import numpy as np

From edfd4b901f875daf23271a6ca55abf52d6991b02 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:38:10 -0700
Subject: [PATCH 256/262] bart: load checkpoint with BartModel instead of BartForConditionalGeneration

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 9cdf5f375..f6700a241 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -19,7 +19,7 @@
 import logging
 from pathlib import Path
 
-from transformers import BartForConditionalGeneration
+from transformers import BartModel
 
 import numpy as np
 import torch  # pytype: disable=import-error
@@ -238,7 +238,7 @@ def convert_checkpoint(args):
     saved_dir = Path(args.saved_dir) / f"{args.inference_tensor_para_size:d}-gpu"
     saved_dir.mkdir(parents=True, exist_ok=True)
 
-    bart_model = BartForConditionalGeneration.from_pretrained(args.in_file)
+    bart_model = BartModel.from_pretrained(args.in_file)
     hf_config = vars(bart_model.config)
     config = configparser.ConfigParser()
 

From cba8f4808ffa2745d8efc12ffd4539d60d41cf2a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:39:53 -0700
Subject: [PATCH 257/262] bart: print parameter names in fuse_decoder_qkv (debug)

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index f6700a241..f96412fa4 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -39,6 +39,7 @@ def get_weight_data_type(data_type):
 def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
     model_dict = {}
     for name, param in model.named_parameters():
+        print(name)
         if name.find("self_attn") == -1 or name.find("decoder.layers") == -1:
             continue
         if name.find(".q_proj.") != -1 or name.find(".k_proj.") != -1 or name.find(".v_proj.") != -1:

From ef1912d71ceeecadb2bb785813303ed775652c95 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:41:09 -0700
Subject: [PATCH 258/262] bart: drop "model." prefix from decoder q/k/v keys in fuse_decoder_qkv

---
 .../utils/huggingface_bart_ckpt_convert.py     | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index f96412fa4..afa9514a8 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -46,10 +46,10 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
             model_dict[name] = param
 
     for i in range(model.config.decoder_layers):
-        shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T.shape
-        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T,
-                         model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"].T,
-                         model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"].T], dim=-1)
+        shape = model_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"].T.shape
+        qkv = torch.cat([model_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"].T,
+                         model_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"].T,
+                         model_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"].T], dim=-1)
 
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
@@ -60,10 +60,10 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
             split_vals[j].tofile(saved_path.as_posix())
 
     for i in range(model.config.decoder_layers):
-        shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"].shape
-        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
-                         model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
-                         model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
+        shape = model_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"].shape
+        qkv = torch.cat([model_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"],
+                         model_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"],
+                         model_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 
         split_vals = np.split(qkv, factor, axis=-1)
@@ -272,8 +272,6 @@ def convert_checkpoint(args):
     np_weight_data_type = get_weight_data_type(args.weight_data_type)
 
     i_gpu_num = args.inference_tensor_para_size
-    # for name, param in bart_model.state_dict().items():
-    #     split_and_convert_process(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
     pool = multiprocessing.Pool(args.processes)
     pool.starmap_async(split_and_convert_process,
                        [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)

From c9911e1f410c4d8b87ed364b93613d7cf9514c87 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:43:16 -0700
Subject: [PATCH 259/262] bart: revert converter to BartForConditionalGeneration and "model."-prefixed keys

---
 .../utils/huggingface_bart_ckpt_convert.py    | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index afa9514a8..9cdf5f375 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -19,7 +19,7 @@
 import logging
 from pathlib import Path
 
-from transformers import BartModel
+from transformers import BartForConditionalGeneration
 
 import numpy as np
 import torch  # pytype: disable=import-error
@@ -39,17 +39,16 @@ def get_weight_data_type(data_type):
 def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
     model_dict = {}
     for name, param in model.named_parameters():
-        print(name)
         if name.find("self_attn") == -1 or name.find("decoder.layers") == -1:
             continue
         if name.find(".q_proj.") != -1 or name.find(".k_proj.") != -1 or name.find(".v_proj.") != -1:
             model_dict[name] = param
 
     for i in range(model.config.decoder_layers):
-        shape = model_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"].T.shape
-        qkv = torch.cat([model_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"].T,
-                         model_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"].T,
-                         model_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"].T], dim=-1)
+        shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T.shape
+        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"].T,
+                         model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"].T,
+                         model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"].T], dim=-1)
 
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
@@ -60,10 +59,10 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
             split_vals[j].tofile(saved_path.as_posix())
 
     for i in range(model.config.decoder_layers):
-        shape = model_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"].shape
-        qkv = torch.cat([model_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"],
-                         model_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"],
-                         model_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
+        shape = model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"].shape
+        qkv = torch.cat([model_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"],
+                         model_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"],
+                         model_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"]], dim=-1)
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
 
         split_vals = np.split(qkv, factor, axis=-1)
@@ -239,7 +238,7 @@ def convert_checkpoint(args):
     saved_dir = Path(args.saved_dir) / f"{args.inference_tensor_para_size:d}-gpu"
     saved_dir.mkdir(parents=True, exist_ok=True)
 
-    bart_model = BartModel.from_pretrained(args.in_file)
+    bart_model = BartForConditionalGeneration.from_pretrained(args.in_file)
     hf_config = vars(bart_model.config)
     config = configparser.ConfigParser()
 
@@ -272,6 +271,8 @@ def convert_checkpoint(args):
     np_weight_data_type = get_weight_data_type(args.weight_data_type)
 
     i_gpu_num = args.inference_tensor_para_size
+    # for name, param in bart_model.state_dict().items():
+    #     split_and_convert_process(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
     pool = multiprocessing.Pool(args.processes)
     pool.starmap_async(split_and_convert_process,
                        [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)

From 2de726fb9379a09cc966759273eefdab9943166b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sat, 23 Sep 2023 21:43:25 -0700
Subject: [PATCH 260/262] bart: remove commented-out serial conversion loop from convert_checkpoint

---
 examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
index 9cdf5f375..2f6dc2c1d 100644
--- a/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
+++ b/examples/pytorch/bart/utils/huggingface_bart_ckpt_convert.py
@@ -271,8 +271,6 @@ def convert_checkpoint(args):
     np_weight_data_type = get_weight_data_type(args.weight_data_type)
 
     i_gpu_num = args.inference_tensor_para_size
-    # for name, param in bart_model.state_dict().items():
-    #     split_and_convert_process(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)
     pool = multiprocessing.Pool(args.processes)
     pool.starmap_async(split_and_convert_process,
                        [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir)

From 179061956cb1045b01fa4286d51019fabf87d49d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 24 Sep 2023 12:02:19 -0700
Subject: [PATCH 261/262] bart: remove fallBackType definition from BartDecoding.h

---
 src/fastertransformer/models/bart/BartDecoding.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/fastertransformer/models/bart/BartDecoding.h b/src/fastertransformer/models/bart/BartDecoding.h
index 50bf6edd3..e3e23ed6d 100644
--- a/src/fastertransformer/models/bart/BartDecoding.h
+++ b/src/fastertransformer/models/bart/BartDecoding.h
@@ -27,17 +27,6 @@
 
 namespace fastertransformer {
 
-// fallback to fp32 dynamic decoder when bf16 specified
-template<typename T>
-struct fallBackType {
-    using Type = float;
-};
-
-template<>
-struct fallBackType<half> {
-    using Type = half;
-};
-
 template<typename T>
 class BartDecoding: public BaseLayer {
 private:

From b997e177ff37d099597de5f884a41d1a59623588 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Sun, 24 Sep 2023 14:44:19 -0700
Subject: [PATCH 262/262] Move fallBackType from T5Decoding.h to DynamicDecodeLayer.h

---
 src/fastertransformer/layers/DynamicDecodeLayer.h | 11 +++++++++++
 src/fastertransformer/models/t5/T5Decoding.h      | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.h b/src/fastertransformer/layers/DynamicDecodeLayer.h
index 3b63cda92..774300731 100644
--- a/src/fastertransformer/layers/DynamicDecodeLayer.h
+++ b/src/fastertransformer/layers/DynamicDecodeLayer.h
@@ -26,6 +26,17 @@
 
 namespace fastertransformer {
 
+// Fall back to fp32 for the dynamic decoder: Type is float for every T (e.g. bf16) except half, which stays half.
+template<typename T>
+struct fallBackType {
+    using Type = float;  // primary template: used for any T without a specialization below
+};
+
+template<>
+struct fallBackType<half> {
+    using Type = half;  // half is kept as-is, no fallback
+};
+
 template<typename T>
 class DynamicDecodeLayer: public BaseLayer {
 protected:
diff --git a/src/fastertransformer/models/t5/T5Decoding.h b/src/fastertransformer/models/t5/T5Decoding.h
index 67f04d480..cf74652a9 100644
--- a/src/fastertransformer/models/t5/T5Decoding.h
+++ b/src/fastertransformer/models/t5/T5Decoding.h
@@ -27,17 +27,6 @@
 
 namespace fastertransformer {
 
-// fallback to fp32 dynamic decoder when bf16 specified
-template<typename T>
-struct fallBackType {
-    using Type = float;
-};
-
-template<>
-struct fallBackType<half> {
-    using Type = half;
-};
-
 template<typename T>
 class T5Decoding: public BaseLayer {
 private: