From 743369a60a6054257983375ac5008e3f19129032 Mon Sep 17 00:00:00 2001
From: Asim Shankar <asim.shankar@snowflake.com>
Date: Mon, 10 Jul 2023 19:24:07 -0700
Subject: [PATCH 01/79] Merge with main (#1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update beam_search_topk_kernels.cu

fix: fix bug of beam search

* fix: change int of some kernels to int64_t to prevent overflow

* fix: gpt tensor shapes inconsistency (#505)

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* Update gpt_guide.md (#529)

* fix: fix bug of gpt buffer and gpt gemm overflow

* Update T5DecodingWeight.cc

fix: fix loading bug of t5

* [Enhancement]add pytorch backend support for gptneox (#550)

* add pytorch backend support for gptneox

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* fix early stopping invalid

* 1) Some unused parameters and logic have been removed. 2) Revisions that would affect pipeline parallelism have been reverted. 3) The code has been made capable of direct validation on TabbyML/NeoX-1.3B.

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* Change the names of classes, removing 'parallel' from their names

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* Format the code.

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* Only print results when rank is 0.

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* Add dist.init_process_group().

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* update docs

Signed-off-by: AkiyamaYummy <842720660@qq.com>

---------

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* Update cublasMMWrapper.cc

Fix the CUBLAS_VERSION checking of cublasMMWrapper

* Update cublasMMWrapper.cc

* fix overflow in softmax_kernel when process long seqlen and big batch_size (#524)

* Update unfused_attention_kernels.cu

fix bug of softmax kernel

* [Enhancement]create huggingface_gptneox_convert.py (#569)

* create huggingface_gptneox_convert.py

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* adjust HF's multi bin files

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* update gptneox_guide.md

Signed-off-by: AkiyamaYummy <842720660@qq.com>

---------

Signed-off-by: AkiyamaYummy <842720660@qq.com>

* perf(bloom): improve performance of huggingface_bloom_convert.py, decrease the time cost and the mem using (#568)

Co-authored-by: r.yang <r.yang@tianrang-inc.com>

* Fix/gpt early stop (#584)

* fix: fix bug of early stopping of gpt

* [bugfix] Fix 2-shot All Reduce correctness issue (indexing bug). (#672)

FasterTransformer 2-shot all reduce is implemented as a reduce-scatter + all-gather. There is an indexing bug in the all-gather step. Prior to this change, 2-shot all reduce was only producing correct results on device 0. Now, all devices have the correct results.

* fix: swap tensor bug (#683)

* Support size_per_head=112 (#660)

* fix multi-gpu build

* add support for size_per_head=112 for gpt decoder

* remove mpi_cxx from multi-gpu build for now (#705)

---------

Signed-off-by: AkiyamaYummy <842720660@qq.com>
Co-authored-by: byshiue <bhsueh@nvidia.com>
Co-authored-by: _yummy_ <842720660@qq.com>
Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: zhangxin81 <115389973+zhangxin81@users.noreply.github.com>
Co-authored-by: 杨睿 <595403043@qq.com>
Co-authored-by: r.yang <r.yang@tianrang-inc.com>
Co-authored-by: Rahul Kindi <rkindi@users.noreply.github.com>
Co-authored-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com>
Co-authored-by: Daya Khudia <37562707+dskhudia@users.noreply.github.com>
Co-authored-by: Dean Wyatte <2512762+dwyatte@users.noreply.github.com>
---
 README.md                                     |   4 +
 docs/gpt_guide.md                             |   2 +-
 docs/gptneox_guide.md                         |  56 ++-
 .../cpp/multi_gpu_gpt/gpt_example_utils.cc    |   2 +-
 .../gpt/utils/huggingface_bloom_convert.py    | 192 ++++++++--
 examples/pytorch/gptneox/gptneox_example.py   | 226 ++++++++++++
 examples/pytorch/gptneox/utils/gptneox.py     | 317 ++++++++++++++++
 .../utils/huggingface_gptneox_convert.py      | 251 +++++++++++++
 .../kernels/beam_search_topk_kernels.cu       |   4 +-
 .../kernels/custom_ar_kernels.cu              |   4 +-
 .../decoder_masked_multihead_attention.cu     |   3 +
 .../decoder_masked_multihead_attention_112.cu | 101 +++++
 .../kernels/decoding_kernels.cu               |  48 +--
 src/fastertransformer/kernels/gpt_kernels.cu  |  48 ++-
 src/fastertransformer/kernels/gpt_kernels.h   |   1 +
 .../kernels/stop_criteria_kernels.cu          |   2 +-
 .../kernels/unfused_attention_kernels.cu      |  19 +-
 .../layers/TensorParallelGeluFfnLayer.cc      |   2 +
 .../layers/TensorParallelReluFfnLayer.cc      |   2 +
 .../layers/TensorParallelSiluFfnLayer.cc      |   2 +
 .../DecoderCrossAttentionLayer.cu             |   6 +-
 .../DecoderSelfAttentionLayer.cc              |   4 +-
 ...ensorParallelDecoderCrossAttentionLayer.cc |   4 +-
 ...TensorParallelDecoderSelfAttentionLayer.cc |   4 +-
 ...ensorParallelDisentangledAttentionLayer.cc |   4 +-
 .../TensorParallelGptContextAttentionLayer.cc |   4 +-
 .../TensorParallelUnfusedAttentionLayer.cc    |   4 +-
 src/fastertransformer/models/bert/Bert.cc     |   4 +-
 .../gptneox/GptNeoXDecoderLayerWeight.h       |   2 +-
 .../models/gptneox/GptNeoXWeight.cc           |  10 +
 .../models/gptneox/GptNeoXWeight.h            |   7 +
 .../models/multi_gpu_gpt/ParallelGpt.cc       | 109 +++---
 .../models/multi_gpu_gpt/ParallelGpt.h        |   7 +-
 .../models/t5/T5DecodingWeight.cc             |   8 +-
 src/fastertransformer/th_op/CMakeLists.txt    |   3 +
 src/fastertransformer/th_op/common/GptOps.cc  |   1 +
 .../th_op/gptneox/CMakeLists.txt              |  17 +
 .../th_op/gptneox/GptNeoXOp.cc                | 164 +++++++++
 .../th_op/gptneox/GptNeoXOp.h                 | 346 ++++++++++++++++++
 .../utils/cublasMMWrapper.cc                  |   4 +-
 .../utils/gemm_test/gpt_gemm_func.cc          |  16 +-
 .../utils/gemm_test/gpt_gemm_func.h           |  16 +-
 ...used_self_multihead_attention_unit_test.py |   6 +-
 tests/unittests/test_gpt_kernels.cu           |   1 +
 44 files changed, 1867 insertions(+), 170 deletions(-)
 create mode 100644 examples/pytorch/gptneox/gptneox_example.py
 create mode 100755 examples/pytorch/gptneox/utils/gptneox.py
 create mode 100644 examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py
 create mode 100644 src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_112.cu
 create mode 100755 src/fastertransformer/th_op/gptneox/CMakeLists.txt
 create mode 100755 src/fastertransformer/th_op/gptneox/GptNeoXOp.cc
 create mode 100755 src/fastertransformer/th_op/gptneox/GptNeoXOp.h

diff --git a/README.md b/README.md
index a82098cd4..a00e0d631 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,7 @@ FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide
 | Swin Transformer | TensorRT       | Yes  | Yes                 | -                       | -               | -                 | -                  |
 | ViT              | PyTorch        | Yes  | Yes                 | -                       | -               | -                 | -                  |
 | ViT              | TensorRT       | Yes  | Yes                 | -                       | -               | -                 | -                  |
+| GPT-NeoX         | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |
 | GPT-NeoX         | Triton backend | Yes  | -                   | -                       | Yes             | Yes               | -                  |
 | BART/mBART       | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |
 | WeNet            | C++            | Yes  | -                   | -                       | -               | -                 | -                  |
@@ -212,6 +213,9 @@ In the experiments of decoding, we updated the following parameters:
 
 ### Changelog
 
+May 2023
+- Fix bugs of generation early stopping
+
 January 2023
 - Support GPT MoE
 - Support FP8 for Bert and GPT (**Experimental**)
diff --git a/docs/gpt_guide.md b/docs/gpt_guide.md
index 4be09d411..4a10c1d46 100644
--- a/docs/gpt_guide.md
+++ b/docs/gpt_guide.md
@@ -458,7 +458,7 @@ python ../examples/pytorch/gpt/utils/huggingface_gpt_convert.py -i gpt2-xl/ -o .
 
 2. Run GPT on PyTorch
 
-    Basically, `gpt_example.py` includes the example how to declare a model, load a ckeckpoint, and forward context inputs and get generated outputs in Pytorch.
+    Basically, `gpt_example.py` includes the example how to declare a model, load a checkpoint, and forward context inputs and get generated outputs in Pytorch.
 
     For generating outputs based on context inputs, create a text file including the context inputs (line by line) and set `--sample_file_input` to the text file path. (By default, the script will generate outputs without context inputs.) Set `--sample_file_output` to write the outputs to a file. Use `--data_type fp16/bf16` to run in FP16 or BF16.
 
diff --git a/docs/gptneox_guide.md b/docs/gptneox_guide.md
index 4a443fae8..dcedbe8ed 100644
--- a/docs/gptneox_guide.md
+++ b/docs/gptneox_guide.md
@@ -36,6 +36,7 @@ We provide the environment variables to tune for specific usage.
 
 * Checkpoint converter
   * EleutherAI
+  * HuggingFace
 * Data type
   * FP32
   * FP16
@@ -46,7 +47,7 @@ We provide the environment variables to tune for specific usage.
   * Bad words list
   * Beam search and sampling are both supported
 
-## Setup
+## Setup from EleutherAI checkpoint
 
 ### Requirements
 
@@ -72,6 +73,22 @@ You may download the tokenizer config [here](https://mystic.the-eye.eu/public/AI
 
 To tokenize/detokenize files, use the script found in `examples/pytorch/gptneox/utils/hftokenizer.py`. You may need to pass the path to the tokenizer config with the `--tokenizer` flag.
 
+## Setup from HuggingFace checkpoint
+
+> Please check out https://huggingface.co/docs to learn more about the usage of the HuggingFace models and tokenizers.
+
+First download a huggingface checkpoint:
+
+```bash
+git lfs clone https://huggingface.co/<MODEL_GROUP>/<MODEL_NAME>
+```
+
+Then use the script provided by FasterTransformer to convert the checkpoint to raw weights, understood by FT. You can change `-i_g` to specify the tensor parallelism size.
+
+```bash
+python ../examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py -i ../path/to/your/model -o ../../path/to/fastertransformer/model -i_g 1 -m_n gptneox
+```
+
 ### Run GPT-NeoX
 
 * Generate the `gemm_config.in` file.\
@@ -89,14 +106,39 @@ To tokenize/detokenize files, use the script found in `examples/pytorch/gptneox/
     mpirun -n 2 --allow-run-as-root ./bin/gptneox_example
     ```
 
-E.g. by setting the `data_type` of `gptneox_config.ini` to `fp16`, users can run gpt model under fp16.
+    E.g. by setting the `data_type` of `gptneox_config.ini` to `fp16`, users can run gpt model under fp16.
+
+    You can then decode the `out` file with the tokenizer:
 
-You can then decode the `out` file with the tokenizer:
+      ```bash
+      wget https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json
+      ../examples/pytorch/gptneox/utils/hftokenizer.py out --tokenizer 20B_tokenizer.json
+      ```
+
+* Run GPT on PyTorch
+
+    `gptneox_example.py` is an example of how to declare a model, load a checkpoint, forward context inputs, and get generated outputs in PyTorch.
+
+    For generating outputs based on context inputs, create a text file including the context inputs (line by line) and set `--sample_input_file` to the text file path. (By default, the script will generate outputs without context inputs.)
+
+    Run with `-h` to see more settings.
+
+    Run GPT with TP and PP on a single node. Note that the number of processes must be equal to `tensor_para_size * pipeline_para_size`.
+
+    ```bash
+    # No parallelism (tensor_para_size=1, pipeline_para_size=1)
+    python ../examples/pytorch/gptneox/gptneox_example.py
+
+    # TP (tensor_para_size=2, pipeline_para_size=1)
+    mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gptneox/gptneox_example.py --tensor_para_size=2 --pipeline_para_size=1 --ckpt_path="/path/to/your/model/2-gpu"
+
+    # PP (tensor_para_size=1, pipeline_para_size=2)
+    mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gptneox/gptneox_example.py --tensor_para_size=1 --pipeline_para_size=2 --ckpt_path="/path/to/your/model/1-gpu"
+
+    # TP and PP (tensor_para_size=2, pipeline_para_size=2)
+    mpirun -n 4 --allow-run-as-root python ../examples/pytorch/gptneox/gptneox_example.py --tensor_para_size=2 --pipeline_para_size=2 --ckpt_path="/path/to/your/model/2-gpu"
+    ```
 
-  ```bash
-  wget https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json
-  ../examples/pytorch/gptneox/utils/hftokenizer.py out --tokenizer 20B_tokenizer.json
-  ```
 <!-- This converter only works for customed checkpoint -->
 <!-- ### Run GPT-NeoX with prompts
 
diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index f06ad8a0b..578fbc90b 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -430,7 +430,7 @@ void populate_request(std::unordered_map<std::string, Tensor>& input_tensors,
     }
 
     if (request_config.is_return_context_embeddings) {
-        deviceMalloc(&output_context_embeddings, request_batch_size * model_config.hidden_units);
+        deviceMalloc(&output_context_embeddings, request_batch_size * beam_width * model_config.hidden_units);
         output_tensors.insert({"context_embeddings",
                                {MEMORY_GPU,
                                 TYPE_FP32,
diff --git a/examples/pytorch/gpt/utils/huggingface_bloom_convert.py b/examples/pytorch/gpt/utils/huggingface_bloom_convert.py
index 7c78711f4..60adad2db 100644
--- a/examples/pytorch/gpt/utils/huggingface_bloom_convert.py
+++ b/examples/pytorch/gpt/utils/huggingface_bloom_convert.py
@@ -20,11 +20,12 @@
 import configparser
 import logging
 import multiprocessing
+import os
 import re
 import time
 
 from pathlib import Path
-from typing import Optional, Union
+from typing import Dict, List, Optional, Union
 
 import numpy as np
 import torch
@@ -77,6 +78,9 @@ def get_args():
     parser.add_argument(
         '-v', '--verbose', action='store_true',
         help='Enable verbose logging')
+    parser.add_argument(
+        '-s', '--by-shard', action='store_true',
+        help='Process shard by shard, enable when converting big model like bloom 175B')
     _args = parser.parse_args()
 
     set_logger(_args.verbose)
@@ -301,40 +305,176 @@ def save_bloom_config(model_config: BloomConfig, save_dir: PathLike):
         config.write(f, space_around_delimiters=False)
 
 
+def load_state_dict(file_path: Path, dtype: torch.dtype) -> Dict[str, torch.Tensor]:
+    """ Read all weights of one model file and cast them to `dtype`
+
+    Files ending in `.safetensors` are read with safetensors, others with `torch.load`.
+
+    # Args.
+        file_path: model file path, ends with .bin or .safetensors.
+        dtype: torch.dtype, data type.
+    # Returns.
+        Dict[str, torch.Tensor]
+    """
+
+    if file_path.suffix != ".safetensors":
+        # pytorch bin file, every entry is replaced by its cast tensor
+        weights = torch.load(file_path, map_location="cpu")
+        for key in weights:
+            weights[key] = weights[key].type(dtype)
+        return weights
+    # safetensors file, the package is only imported when it is needed
+    from safetensors import safe_open
+    weights = {}
+    with safe_open(file_path, framework="pt", device="cpu") as reader:
+        for key in reader.keys():
+            weights[key] = reader.get_tensor(key).type(dtype)
+    return weights
+
+
+def get_model_files(model_name: str) -> List[Path]:
+    """ List all model files that you want to load and convert
+
+    # Args.
+        model_name: name(like `bigscience/bloom`) or local directory of the model
+    # Returns.
+        List[Path] model file paths, `*.safetensors` files win over `*.bin` files
+    # Raises.
+        FileNotFoundError: no model file is found in the model directory
+    """
+
+    import glob
+    from huggingface_hub import try_to_load_from_cache
+
+    model_dir = model_name
+
+    # get the local model directory
+    try:
+        # will fall back to HUGGINGFACE_HUB_CACHE
+        config_path = try_to_load_from_cache(
+            model_name, "config.json", cache_dir=os.getenv("TRANSFORMERS_CACHE")
+        )
+        # the lookup may also return None or a non-str sentinel on a cache miss
+        if isinstance(config_path, str):
+            # treat the model name as an huggingface model path
+            model_dir = os.path.dirname(config_path)
+    except Exception:
+        # treat the model name as an explicit model path
+        logger.debug("no cached hub entry for %s, using it as a directory", model_name)
+
+    model_files = glob.glob(model_dir + "/*.bin")
+    try:
+        from safetensors import safe_open as _  # only probes that the package exists
+        st_files = glob.glob(model_dir + "/*.safetensors")
+    except ImportError:
+        st_files = []
+    if st_files:
+        model_files = st_files
+    logger.info("loading from %s format", "safetensors" if st_files else "pytorch bin")
+
+    if not model_files:
+        raise FileNotFoundError('model files not found')
+
+    logger.info(f"model file num: {len(model_files)}")
+    return [Path(i) for i in model_files]
+
+
+def process_by_model_param(model_id: str, dtype: torch.dtype, tp_size: int, save_dir: Path, nproc: int):
+    """ Process conversion parameter by parameter.
+
+    # Args.
+        model_id: model name or local directory, passed to `BloomConfig.from_pretrained`.
+        dtype: torch.dtype, data type the weights are cast to when loaded.
+        tp_size: passed through to `convert_and_save_parameter`.
+        save_dir: output directory of the config and the converted parameters.
+        nproc: number of worker processes, a pool is used only when nproc > 1.
+    """
+
+    # init bloom config
+    model_config = BloomConfig.from_pretrained(model_id)
+    # list all model files
+    model_files = get_model_files(model_id)
+    # save bloom config to output dir
+    save_bloom_config(model_config, save_dir)
+
+    def iter_star_args():
+        for model_file in model_files:
+            state_dict = load_state_dict(model_file, dtype)
+            for name in state_dict:
+                # Preprocess
+                param_name = convert_parameter_name(name)
+                param = safe_transpose(state_dict[name])
+                param = handle_exceptions(model_config, param_name, param)
+                yield param_name, param.detach().cpu().numpy(), tp_size, save_dir
+
+    if nproc > 1:
+        # starmap blocks until all tasks finish and re-raises worker errors,
+        # while a discarded starmap_async result silently drops them.
+        with multiprocessing.Pool(nproc) as pool:
+            pool.starmap(convert_and_save_parameter, list(iter_star_args()))
+    else:
+        for star_arg in iter_star_args():
+            convert_and_save_parameter(*star_arg)
+
+
+def _process_by_model_shard(model_config, model_file, dtype: torch.dtype, tp_size: int, save_dir: Path):
+    """ Convert and save every parameter stored in a single model file. """
+    shard_weights = load_state_dict(model_file, dtype)
+    for raw_name, tensor in shard_weights.items():
+        # Preprocess
+        ft_name = convert_parameter_name(raw_name)
+        tensor = handle_exceptions(model_config, ft_name, safe_transpose(tensor))
+        array = tensor.detach().cpu().numpy()
+        convert_and_save_parameter(ft_name, array, tp_size, save_dir)
+
+
+def process_by_model_shard(model_id: str, dtype: torch.dtype, tp_size: int, save_dir: Path, nproc: int):
+    """ Process conversion shard by shard.
+
+    One `_process_by_model_shard` call is made per model file.
+
+    Benchmarks @ 64C(Intel Xeon 6326 2.90GH) x 756G:
+
+        | model      | format           | by-shard | nproc | elapsed(s) | mem  |
+        |------------|------------------|----------|-------|------------|------|
+        | bloom-175b | safetensors x 72 | NO       | 8     | 1516.66    | 350G |
+        | bloom-175b | safetensors x 72 | YES      | 8     | 1165.03    | 50G  |
+        | bloom-175b | safetensors x 72 | YES      | 24    | 494.81     | 150G |
+    """
+
+    # init bloom config
+    model_config = BloomConfig.from_pretrained(model_id)
+    # list all model files
+    model_files = get_model_files(model_id)
+    # save bloom config to output dir
+    save_bloom_config(model_config, save_dir)
+
+    if nproc > 1:
+        star_args = [(model_config, model_file, dtype, tp_size, save_dir)
+                     for model_file in model_files]
+        # starmap blocks until every shard is done and re-raises worker errors,
+        # while a discarded starmap_async result silently drops them.
+        with multiprocessing.Pool(nproc) as pool:
+            pool.starmap(_process_by_model_shard, star_args)
+    else:
+        for model_file in model_files:
+            _process_by_model_shard(model_config, model_file, dtype, tp_size, save_dir)
+
+
 def main():
+    start_time = time.time()
     args = get_args()
     tp_size = args.tensor_para_size
-
     dtype = DATATYPE_MAP[args.data_type]
-    model = AutoModel.from_pretrained(args.input_dir).cpu().type(dtype)
-    assert isinstance(model, torch.nn.Module)
 
     save_dir = Path(args.output_dir) / f'{tp_size}-gpu'
     save_dir.mkdir(exist_ok=True, parents=True)
-    save_bloom_config(model.config, save_dir)
 
-    start_time = time.time()
-    logger.info(f'Start the checkpoint conversion: '
-                f'{len(list(model.parameters()))} params')
-    if args.processes > 1:
-        pool = multiprocessing.Pool(args.processes)
-        star_args = []
-        for name, param in model.named_parameters():
-            # Preprocess
-            param_name = convert_parameter_name(name)
-            param = safe_transpose(param)
-            param = handle_exceptions(model.config, param_name, param)
-            star_args.append((param_name, param.detach().cpu().numpy(), tp_size, save_dir))
-        pool.starmap_async(convert_and_save_parameter, star_args)
-        pool.close()
-        pool.join()
+    if args.by_shard:
+        process_by_model_shard(args.input_dir, dtype, tp_size, save_dir, args.processes)
     else:
-        for name, param in model.named_parameters():
-            # Preprocess
-            param_name = convert_parameter_name(name)
-            param = safe_transpose(param)
-            param = handle_exceptions(model.config, param_name, param)
-            convert_and_save_parameter(param_name, param.detach().cpu().numpy(), tp_size, save_dir)
+        process_by_model_param(args.input_dir, dtype, tp_size, save_dir, args.processes)
+
     elapsed_time = time.time() - start_time
     logger.info(f'Checkpoint conversion (HF >> FT) has done '
                 f'(elapsed time: {elapsed_time:.2f} sec)')
diff --git a/examples/pytorch/gptneox/gptneox_example.py b/examples/pytorch/gptneox/gptneox_example.py
new file mode 100644
index 000000000..ca59c48d5
--- /dev/null
+++ b/examples/pytorch/gptneox/gptneox_example.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2021, NAVER Corp.  Authored by CLOVA.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from torch.nn.utils.rnn import pad_sequence
+import random
+import os
+import sys
+import argparse
+import configparser
+import timeit
+import torch
+import torch.distributed as dist
+import numpy as np
+from transformers import AutoTokenizer
+dir_path = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(dir_path + "/../../..")
+from examples.pytorch.gptneox.utils.gptneox import GptNeoX
+
+def main():
+    """ Run the GPT-NeoX PyTorch example.
+
+    Reads the model hyper-parameters from `config.ini` under `--ckpt_path`,
+    builds the inputs (the lines of `--sample_input_file`, or `--max_batch_size`
+    copies of the end token), runs one generation and prints the decoded
+    outputs on rank 0. With `--time`, 10 warmup runs and 10 timed runs follow
+    and the measured throughput is printed.
+
+    Multi-process runs (tensor_para_size * pipeline_para_size > 1) initialize
+    torch.distributed with the MPI backend.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output_len', type=int, default=32,
+                        help='output sequence length to generate.')
+    parser.add_argument('--beam_width', type=int, default=1,
+                        help='beam width for beam search. Using sampling when beam width is 1.')
+    parser.add_argument('--top_k', type=int, default=1,
+                        help='top k candidate num')
+    parser.add_argument('--top_p', type=float, default=0.,
+                        help='top p probability threshold')
+    parser.add_argument('--temperature', type=float, default=1.,
+                        help='temperature')
+    parser.add_argument('--len_penalty', type=float, default=0.,
+                        help='len_penalty')
+    parser.add_argument('--beam_search_diversity_rate', type=float, default=0.,
+                        help='beam_search_diversity_rate')
+    parser.add_argument('--tensor_para_size', type=int, default=1,
+                        help='tensor parallel size')
+    parser.add_argument('--pipeline_para_size', type=int, default=1,
+                        help='pipeline parallel size')
+    parser.add_argument('--ckpt_path', type=str, default='../models/gptneox/c-model/NeoX-1.3B/1-gpu',
+                        help='path to the checkpoint file.')
+    parser.add_argument('--tokenizer_path', type=str, default='../models/gptneox/model/NeoX-1.3B',
+                        help='directory where the tokenizer file is located.')
+    parser.add_argument('--lib_path', type=str, default='./lib/libth_transformer.so',
+                        help='path to the pyt_fastertransformer dynamic lib file.')
+    parser.add_argument('--sample_input_file', type=str,
+                        help='path to the sample input file.')
+    parser.add_argument('--max_batch_size', type=int, default=8,
+                        help='max batch size.')
+    parser.add_argument('--repetition_penalty', type=float, default=1.,
+                        help='repetition penalty')
+    parser.add_argument('--max_seq_len', type=int, default=1024,
+                        help='max sequence length for position embedding table.')
+    parser.add_argument('--inference_data_type', '--data_type', type=str, choices=['fp32', 'fp16'], default='fp16')
+    parser.add_argument('--time', action='store_true',
+                        help='whether or not to measure time elapsed.')
+    parser.add_argument('--enable_random_seed', action='store_true',
+                        help='is enable the random seed.')
+
+    args = parser.parse_args()
+
+    # Model hyper-parameters come from the [gptneox] section of config.ini.
+    config = configparser.ConfigParser()
+    config.read(os.path.join(args.ckpt_path, "config.ini"))
+    head_num = int(config.get('gptneox', 'head_num'))
+    size_per_head = int(config.get('gptneox', 'size_per_head'))
+    vocab_size = int(config.get('gptneox', 'vocab_size'))
+    layer_num = int(config.get('gptneox', 'num_layer'))
+    rotary_embedding = int(config.get('gptneox', 'rotary_embedding'))
+    start_id = int(config.get('gptneox', 'start_id'))
+    end_id = int(config.get('gptneox', 'end_id'))
+    use_gptj_residual = (config.get('gptneox', 'use_gptj_residual') == "1")
+    weight_data_type = config.get('gptneox', 'weight_data_type')
+
+    ckpt_path = args.ckpt_path
+    tokenizer_path = args.tokenizer_path
+    lib_path = args.lib_path
+    output_len = args.output_len
+    beam_width = args.beam_width
+    top_k = args.top_k
+    top_p = args.top_p
+    temperature = args.temperature
+    len_penalty = args.len_penalty
+    beam_search_diversity_rate = args.beam_search_diversity_rate
+    tensor_para_size = args.tensor_para_size
+    pipeline_para_size = args.pipeline_para_size
+    max_batch_size = args.max_batch_size
+    max_seq_len = args.max_seq_len
+    repetition_penalty = args.repetition_penalty
+    inference_data_type = args.inference_data_type
+
+    print("\n=============== Arguments ===============")
+    for arg in vars(args):
+        print("{}: {}".format(arg, getattr(args, arg)))
+    print("=========================================\n")
+
+    # Only multi-process runs initialize torch.distributed (MPI backend).
+    if tensor_para_size * pipeline_para_size > 1:
+        dist.init_process_group(backend=dist.Backend.MPI)
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    device_count = dist.get_world_size() if dist.is_initialized() else 1
+    device = rank % device_count
+    torch.cuda.set_device(device)
+    device = torch.cuda.current_device()
+
+    # sentencepiece needed
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+
+    # Inputs
+    contexts = []
+    if args.sample_input_file:  # conditional case
+        with open(args.sample_input_file, "r") as f:
+            contexts = f.read().splitlines()
+            batch_size = min(len(contexts), max_batch_size)
+        contexts = contexts[:batch_size]
+        start_ids = [torch.tensor(tokenizer.encode(c), dtype=torch.int32, device=device) for c in contexts]
+    else:  # unconditional case
+        batch_size = max_batch_size
+        contexts = ['<|endoftext|>'] * batch_size
+        start_ids = [torch.IntTensor([end_id])] * batch_size
+
+    print("[INFO] batch size: {}".format(batch_size))
+
+    # Keep the unpadded prompt lengths, then pad every prompt with end_id.
+    start_lengths = [len(ids) for ids in start_ids]
+
+    start_ids = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
+    start_lengths = torch.IntTensor(start_lengths)
+
+    # One seed per batch entry, all zeros unless --enable_random_seed is set.
+    if args.enable_random_seed:
+        random_seed_tensor = torch.randint(0, 10000, size=[batch_size], dtype=torch.int64)
+    else:
+        random_seed_tensor = torch.zeros([batch_size], dtype=torch.int64)
+
+    # Prepare model.
+    gpt = GptNeoX(head_num, size_per_head, vocab_size, rotary_embedding,
+                  start_id, end_id, layer_num, max_seq_len,
+                  tensor_para_size, pipeline_para_size,
+                  use_gptj_residual, lib_path,
+                  inference_data_type=inference_data_type,
+                  weights_data_type=weight_data_type)
+    if not gpt.load(ckpt_path=ckpt_path):
+        print("[WARNING] Checkpoint file not found. Model loading is skipped.")
+
+    def generate():
+        """ Run one generation with the sampling settings parsed above. """
+
+        def per_sample(value, dtype):
+            # Broadcast a scalar setting to a [batch_size] tensor.
+            return value * torch.ones(size=[batch_size], dtype=dtype)
+
+        return gpt(
+            start_ids=start_ids,
+            start_lengths=start_lengths,
+            output_len=output_len,
+            beam_width=beam_width,
+            top_k=per_sample(top_k, torch.int32),
+            top_p=per_sample(top_p, torch.float32),
+            beam_search_diversity_rate=per_sample(beam_search_diversity_rate, torch.float32),
+            temperature=per_sample(temperature, torch.float32),
+            len_penalty=per_sample(len_penalty, torch.float32),
+            repetition_penalty=per_sample(repetition_penalty, torch.float32),
+            random_seed=random_seed_tensor,
+            return_output_length=False,
+            return_cum_log_probs=0)
+
+    with torch.no_grad():
+        tokens_batch = generate()
+        if tokens_batch is not None and rank == 0:
+            tokens_batch = tokens_batch.cpu().numpy()
+            for i, (context, tokens) in enumerate(zip(contexts, tokens_batch)):
+                for beam_id in range(beam_width):
+                    token = tokens[beam_id][start_lengths[i]:]  # exclude context input from the output
+                    output = tokenizer.decode(token)
+                    print(f'[INFO] batch {i}, beam {beam_id}:\n[Context]\n{context}\n\n[Output]\n{output}\n')
+
+        # Measure inference time.
+        if args.time:
+            iterations = 10
+            # warmup
+            for _ in range(iterations):
+                generate()
+
+            batch_num = 0
+            token_num = 0
+            start_time = timeit.default_timer()
+            # timed runs
+            for _ in range(iterations):
+                tokens_batch = generate()
+                batch_num += 1
+                for j, tokens in enumerate(tokens_batch):
+                    # Count only the generated part, the prompt length is subtracted.
+                    token_num += tokens.shape[-1] - start_lengths[j]
+            time_elapsed = timeit.default_timer() - start_time
+            throughput = token_num / time_elapsed
+            print(f"[INFO] FT-GPT generates {batch_num} batches, taking {time_elapsed:0.3f} secs "
+                  f"to generate {token_num} tokens, {throughput:0.3f} tokens/sec.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/pytorch/gptneox/utils/gptneox.py b/examples/pytorch/gptneox/utils/gptneox.py
new file mode 100755
index 000000000..df36488fe
--- /dev/null
+++ b/examples/pytorch/gptneox/utils/gptneox.py
@@ -0,0 +1,317 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import inspect
+import argparse
+import dataclasses
+import json
+import os
+import pathlib
+import typing
+
+import torch
+import torch.nn as nn
+import numpy as np
+import torch.distributed as dist
+
+str_type_map = {"fp32": torch.float32, "fp16": torch.float16}
+
+class GptNeoXWeights(object):
+    def __init__(self, 
+                 head_num, size_per_head, layer_num, vocab_size, 
+                 max_seq_len, tensor_para_size, pipeline_para_size, use_gptj_residual, 
+                 inference_data_type: str = "fp16",
+                 weights_data_type: np.dtype = np.float32):
+        assert(head_num % tensor_para_size == 0)
+
+        self.head_num = head_num
+        self.size_per_head = size_per_head
+        self.layer_num = layer_num
+        self.vocab_size = vocab_size
+        self.max_seq_len = max_seq_len
+        self.tensor_para_size = tensor_para_size
+        self.pipeline_para_size = pipeline_para_size
+        self.layers_per_device = layer_num // pipeline_para_size
+
+        self.use_gptj_residual = use_gptj_residual
+
+        local_head_num = head_num // tensor_para_size
+        global_head_num = head_num
+        local_hidden_units = local_head_num * size_per_head
+        global_hidden_units = global_head_num * size_per_head
+        local_inter_size = local_hidden_units * 4
+
+        self.local_head_num = local_head_num
+        self.global_head_num = global_head_num
+        self.local_hidden_units = local_hidden_units
+        self.global_hidden_units = global_hidden_units
+        self.local_inter_size = local_inter_size
+
+        if isinstance(weights_data_type, str):
+            try:
+                weights_data_type = {
+                    "fp16": np.float16,
+                    "fp32": np.float32,
+                    "float16": np.float16,
+                    "float32": np.float32,
+                }[weights_data_type]
+            except KeyError:
+                raise ValueError(f"Don't know how to interpret weights_data_type: {weights_data_type}")
+
+        assert weights_data_type in [np.float32, np.float16]
+        self.weights_data_type = weights_data_type
+        self.inference_data_type = str_type_map[inference_data_type]
+
+        self.w = []
+        # Transformer blocks
+        self.w.extend([torch.zeros(global_hidden_units, dtype=self.inference_data_type)] * layer_num)                           # pre_layernorm_weights.beta
+        self.w.extend([torch.zeros(global_hidden_units, dtype=self.inference_data_type)] * layer_num)                           # pre_layernorm_weights.gamma
+        self.w.extend([torch.zeros(global_hidden_units, local_hidden_units * 3, dtype=self.inference_data_type)] * layer_num)   # self_attention_weights.query_weight.kernel
+        self.w.extend([torch.zeros(local_hidden_units * 3, dtype=self.inference_data_type)] * layer_num)                        # self_attention_weights.query_weight.bias
+        self.w.extend([torch.zeros(local_hidden_units, global_hidden_units, dtype=self.inference_data_type)] * layer_num)       # self_attention_weights.attention_output_weight.kernel
+        self.w.extend([torch.zeros(global_hidden_units, dtype=self.inference_data_type) if not use_gptj_residual else torch.empty(0)] * layer_num)
+                                                                                                # self_attention_weights.attention_output_weight.bias
+        
+        self.w.extend([torch.zeros(global_hidden_units, local_inter_size, dtype=self.inference_data_type)] * layer_num)         # ffn_weights.intermediate_weight.kernel
+        self.w.extend([torch.zeros(local_inter_size, dtype=self.inference_data_type)] * layer_num)                              # ffn_weights.intermediate_weight.bias
+        self.w.extend([torch.zeros(local_inter_size, global_hidden_units, dtype=self.inference_data_type)] * layer_num)         # ffn_weights.output_weight.kernel
+        self.w.extend([torch.zeros(global_hidden_units, dtype=self.inference_data_type)] * layer_num)                           # ffn_weights.output_weight.bias
+        
+        self.w.extend([torch.zeros(global_hidden_units, dtype=self.inference_data_type)] * layer_num)                           # post_attention_layernorm_weights.beta
+        self.w.extend([torch.zeros(global_hidden_units, dtype=self.inference_data_type)] * layer_num)                           # post_attention_layernorm_weights.gamma
+
+        # After Transformer blocks
+        self.w.append(torch.zeros(vocab_size, global_hidden_units, dtype=self.inference_data_type))                             # pre_decoder_embedding_table
+        self.w.append(torch.zeros(global_hidden_units, dtype=self.inference_data_type))                                         # post_decoder_layernorm.beta
+        self.w.append(torch.zeros(global_hidden_units, dtype=self.inference_data_type))                                         # post_decoder_layernorm.gamma
+        self.w.append(torch.zeros(vocab_size, global_hidden_units, dtype=self.inference_data_type))                             # post_decoder_embedding.kernel
+
+        # Initialization
+        self._map(lambda w: torch.nn.init.normal_(w, mean=0., std=0.01))
+
+    def __getitem__(self, idx):
+        return self.w[idx]
+
+    def __setitem__(self, idx, val):
+        self.w[idx] = val
+
+    def __len__(self):
+        return len(self.w)
+
+    def _map(self, func):
+        """Replace every stored weight (and every entry of a nested list) with func(weight)."""
+        for idx, entry in enumerate(self.w):
+            if not isinstance(entry, list):
+                self.w[idx] = func(entry)
+            else:
+                entry[:] = [func(item) for item in entry]
+
+    def load(self, ckpt_path, tensor_para_rank, pipeline_para_rank):
+        """Read this rank's weights from the .bin files under ckpt_path; return False if the path is missing."""
+        if not os.path.exists(ckpt_path):
+            return False
+        w = []
+
+        type_map = {np.float32: torch.float32, np.float16: torch.float16}
+        # Per-layer files are only read for layers owned by this pipeline stage.
+        def is_load(i):
+            return i >= self.layers_per_device * pipeline_para_rank and i < self.layers_per_device * (pipeline_para_rank + 1)
+
+        file_names = ["input_layernorm.bias", 
+                      "input_layernorm.weight", 
+                      "attention.query_key_value.weight.%d" % tensor_para_rank,
+                      "attention.query_key_value.bias.%d" % tensor_para_rank,
+                      "attention.dense.weight.%d" % tensor_para_rank,
+                      "attention.dense.bias" if not self.use_gptj_residual else None,
+                      "mlp.dense_h_to_4h.weight.%d" % tensor_para_rank,
+                      "mlp.dense_h_to_4h.bias.%d" % tensor_para_rank,
+                      "mlp.dense_4h_to_h.weight.%d" % tensor_para_rank,
+                      "mlp.attention.bias.sum" if self.use_gptj_residual else "mlp.dense_4h_to_h.bias",
+                      "post_attention_layernorm.bias",
+                      "post_attention_layernorm.weight"]
+
+        for file_name in file_names:
+            for i in range(self.layer_num):
+                if file_name is not None and is_load(i):
+                    w.append(torch.from_numpy(np.fromfile(
+                                "%s/model.layers.%d.%s.bin" % (ckpt_path, i, file_name),
+                                dtype=self.weights_data_type)).to(self.inference_data_type))
+                else:
+                    w.append(torch.empty(0).to(self.inference_data_type))
+        # Tail order must match __init__: embedding table, final layernorm beta (bias), gamma (weight), lm_head.
+        w.append(torch.from_numpy(np.fromfile(ckpt_path + "/model.wte.bin", dtype=self.weights_data_type)).to(self.inference_data_type))
+        w.append(torch.from_numpy(np.fromfile(ckpt_path + "/model.final_layernorm.bias.bin", dtype=self.weights_data_type)).to(self.inference_data_type))
+        w.append(torch.from_numpy(np.fromfile(ckpt_path + "/model.final_layernorm.weight.bin", dtype=self.weights_data_type)).to(self.inference_data_type))
+        w.append(torch.from_numpy(np.fromfile(ckpt_path + "/model.lm_head.weight.bin", dtype=self.weights_data_type)).to(self.inference_data_type))
+
+        try:
+            for i in range(len(w)):
+                if w[i].nelement() > 0:
+                    self.w[i] = w[i].reshape(self.w[i].shape)
+                else:
+                    self.w[i] = w[i]
+
+        except RuntimeError:
+            raise RuntimeError(
+                f"head_num, size_per_head, vocab_size, and max_seq_len must be the same as the ones during training "
+                f"(idx: {i} expected shape: {self.w[i].shape} got shape: {w[i].shape})."
+            )
+
+        return True
+
+
+class GptNeoX(nn.Module):
+    def __init__(self,
+                 head_num, size_per_head, vocab_size, rotary_embedding_dim,
+                 start_id, end_id, layer_num,
+                 max_seq_len,
+                 tensor_para_size, pipeline_para_size,
+                 use_gptj_residual,
+                 lib_path,
+                 inference_data_type: str = "fp16",
+                 weights_data_type: np.dtype = np.float32):
+        """Record hyper-parameters, load the FasterTransformer op library and bind this rank to a CUDA device."""
+        super().__init__()
+        self.head_num = head_num
+        self.size_per_head = size_per_head
+        self.inter_size = 4 * self.head_num * self.size_per_head
+        self.vocab_size = vocab_size
+        self.rotary_embedding_dim = rotary_embedding_dim
+        self.start_id = start_id
+        self.end_id = end_id
+        self.max_seq_len = max_seq_len
+        self.layer_num = layer_num
+        self.use_gptj_residual = use_gptj_residual
+
+        self.tensor_para_size = tensor_para_size
+        self.pipeline_para_size = pipeline_para_size
+        self.build_model = False
+        self.weights_data_type = weights_data_type
+        self.inference_data_type = inference_data_type
+
+        assert torch.cuda.is_available(), "CUDA is required for this model."
+
+        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
+        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
+
+        # Load the C++ model into Pytorch model.
+        torch.classes.load_library(os.path.abspath(lib_path))
+        
+        # Prepare weights
+        self.weights = GptNeoXWeights(head_num, size_per_head, layer_num, vocab_size,
+                                      max_seq_len, tensor_para_size, pipeline_para_size, use_gptj_residual,
+                                      weights_data_type=weights_data_type, inference_data_type=inference_data_type)
+        
+        # Prepare for tensor/pipeline parallel
+        try:
+            dist.init_process_group(backend='mpi')
+        except Exception:  # not a bare except, so KeyboardInterrupt / SystemExit still propagate
+            print("[INFO] WARNING: Have initialized the process group")
+        self.rank = dist.get_rank()
+        self.device_count = torch.cuda.device_count()
+        self.device = self.rank % self.device_count
+        torch.cuda.set_device(self.device)
+
+        world_size = dist.get_world_size()
+        # The launched process count has to cover the tensor x pipeline grid exactly.
+        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."
+
+        self.tensor_para_rank = self.rank % self.tensor_para_size
+        self.pipeline_para_rank = self.rank // self.tensor_para_size
+
+        # Create and copy model to the device.
+        # self.cuda()
+
+    def load(self, ckpt_path):
+        is_load = self.weights.load(ckpt_path, tensor_para_rank=self.tensor_para_rank,
+                                    pipeline_para_rank=self.pipeline_para_rank)
+        self.cuda()
+        return is_load
+
+    def half(self):
+        self.weights._map(lambda w: w.half())
+        self.cuda()
+
+    def cuda(self):
+        self.weights._map(lambda w: w.cuda(self.device))
+
+        if self.build_model:
+            del self.model
+            self.build_model = False
+        
+        self.model = torch.classes.FasterTransformer.GptNeoXOp(self.head_num, self.size_per_head, self.inter_size,
+                                                               self.layer_num, self.vocab_size, self.rotary_embedding_dim, 
+                                                               self.start_id, self.end_id, self.tensor_para_size, self.pipeline_para_size,
+                                                               self.max_seq_len, self.use_gptj_residual, self.weights.w)
+
+        self.build_model = True
+
+    def forward(self,
+                start_ids: torch.Tensor,
+                start_lengths: torch.Tensor,
+                output_len,
+                beam_width = 1,
+                top_k: torch.Tensor = None,
+                top_p: torch.Tensor = None,
+                beam_search_diversity_rate: torch.Tensor = None,
+                temperature: torch.Tensor = None,
+                len_penalty: torch.Tensor = None,
+                repetition_penalty: torch.Tensor = None,
+                random_seed: torch.Tensor = None,
+                return_output_length = False,
+                return_cum_log_probs=0):
+        if not self.build_model:
+            self.cuda()
+        input_len = start_ids.size(1)
+        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
+
+        # Inputs to device
+        input_ids = start_ids.cuda(self.device)
+        input_lengths = start_lengths.cuda(self.device)
+        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
+        outputs = self.model.forward(input_ids,
+                                     input_lengths,
+                                     output_len,
+                                     beam_width, # optional, can be None
+                                     top_k, # optional, can be None
+                                     top_p, # optional, can be None
+                                     beam_search_diversity_rate, # optional, can be None
+                                     temperature, # optional, can be None
+                                     len_penalty, # optional, can be None
+                                     repetition_penalty, # optional, can be None
+                                     random_seed, # optional, can be None
+                                     return_cum_log_probs) # optional, can be None
+
+        if return_cum_log_probs == 0:
+            output_ids, output_lengths = outputs
+        else:
+            output_ids, output_lengths, output_cum_log_probs = outputs
+        if return_output_length:
+            if return_cum_log_probs > 0:
+                return output_ids, output_lengths, output_cum_log_probs
+            else:
+                return output_ids, output_lengths
+        else:
+            return output_ids
+
+    def set_input_tensor(self, input_tensor):
+        """Set input tensor to be used instead of forward()'s input.
+
+        When doing pipeline parallelism the input from the previous
+        stage comes from communication, not from the input, so the
+        model's forward_step_func won't have it. This function is thus
+        used by internal code to bypass the input provided by the
+        forward_step_func"""
+        self.input_tensor = input_tensor
diff --git a/examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py b/examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py
new file mode 100644
index 000000000..e652d9c89
--- /dev/null
+++ b/examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py
@@ -0,0 +1,251 @@
+import argparse
+import configparser
+import multiprocessing
+import os
+import shutil
+from pathlib import Path
+
+import numpy as np
+import torch
+from transformers import GPTNeoXForCausalLM
+
+
+def get_weight_data_type(data_type):
+    """Translate the "fp32"/"fp16" option string into the matching numpy dtype."""
+    for type_name, np_type in (("fp32", np.float32), ("fp16", np.float16)):
+        if data_type == type_name:
+            return np_type
+    # No supported name matched.
+    assert False, f"Invalid weight data type {data_type}"
+
+
+def split_and_convert_process(saved_dir, factor, key, args, config, val):
+
+    if (
+        key.find("input_layernorm.weight") != -1
+        or key.find("input_layernorm.bias") != -1
+        or key.find("post_attention_layernorm.weight") != -1
+        or key.find("post_attention_layernorm.bias") != -1
+        or key.find("final_layernorm.weight") != -1
+        or key.find("final_layernorm.bias") != -1
+    ):
+        saved_path = saved_dir + f"/model.{key}.bin"
+        val.tofile(saved_path)
+
+    elif (
+        key.find("attention.dense.bias") != -1
+        or key.find("mlp.dense_4h_to_h.bias") != -1
+    ):
+        saved_path = saved_dir + f"/model.{key}.bin"
+        val = (val / factor) if factor > 1 else val
+        val.tofile(saved_path)
+
+    else:
+        if (
+            key.find("attention.dense.weight") != -1
+            or key.find("mlp.dense_4h_to_h.weight") != -1
+        ):
+            split_vals = np.split(val, factor, axis=0)
+
+        elif (
+            key.find("mlp.dense_h_to_4h.weight") != -1
+            or key.find("mlp.dense_h_to_4h.bias") != -1
+        ):
+            split_vals = np.split(val, factor, axis=-1)
+
+        elif key.find("attention.query_key_value.bias") != -1:
+            local_dim = (int)(val.shape[-1] / 3)
+            n_head = config["num_attention_heads"]
+
+            val = val.reshape(n_head, 3, local_dim // n_head)
+            val = np.transpose(val, [1, 0, 2]).reshape(3, local_dim)
+            split_vals = np.split(val, factor, axis=-1)
+
+        elif key.find("attention.query_key_value.weight") != -1:
+            hidden_dim = val.shape[0]
+            local_dim = (int)(val.shape[-1] / 3)
+            n_head = config["num_attention_heads"]
+            # Note that the HF qkv weight are stored as [hidden_size, num_heads, 3, head_hidden]
+            # FT needs the shape of [hidden_size, 3, num_heads, head_hidden]
+            val = val.reshape(hidden_dim, n_head, 3, local_dim // n_head)
+            val = np.transpose(val, [0, 2, 1, 3]).reshape(hidden_dim, 3, local_dim)
+
+            # print(np.mean(np.abs(val[:, 0, :])))
+            split_vals = np.split(val, factor, axis=-1)
+
+        else:
+            print("[ERROR] cannot find key '{}'".format(key))
+            return
+
+        for j in range(factor):
+            saved_path = saved_dir + f"/model.{key}.{j}.bin"
+            split_vals[j].tofile(saved_path)
+
+
+def split_and_convert(args):
+    saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num
+
+    if os.path.exists(saved_dir) == False:
+        os.makedirs(saved_dir)
+
+    factor = args.infer_gpu_num
+
+    # load position_embedding from rank 0
+    # model = torch.load(ckpt_name)
+    model = GPTNeoXForCausalLM.from_pretrained(args.in_file)
+    hf_config = vars(model.config)
+
+    np_weight_data_type = get_weight_data_type(args.weight_data_type)
+
+    try:
+        model_name = args.model_name
+        n_heads = hf_config["num_attention_heads"]
+        head_size = hf_config["hidden_size"] // n_heads
+        rotary_dim = int(head_size * hf_config["rotary_pct"])
+        use_gptj_residual = int(hf_config["use_parallel_residual"])
+
+        config = configparser.ConfigParser()
+        config["gptneox"] = {}
+        config["gptneox"]["model_name"] = model_name
+        config["gptneox"]["head_num"] = str(n_heads)
+        config["gptneox"]["size_per_head"] = str(head_size)
+        config["gptneox"]["inter_size"] = str(hf_config["intermediate_size"])
+        config["gptneox"]["num_layer"] = str(hf_config["num_hidden_layers"])
+        config["gptneox"]["rotary_embedding"] = str(rotary_dim)
+        config["gptneox"]["vocab_size"] = str(hf_config["vocab_size"])
+        config["gptneox"]["start_id"] = str(hf_config["bos_token_id"])
+        config["gptneox"]["end_id"] = str(hf_config["eos_token_id"])
+        config["gptneox"]["use_gptj_residual"] = str(use_gptj_residual)
+        config["gptneox"]["weight_data_type"] = args.weight_data_type
+
+        with open((Path(saved_dir) / f"config.ini").as_posix(), "w") as configfile:
+            config.write(configfile)
+    except Exception as e:
+        print(f"Fail to save the config in config.ini.", e)
+
+    ft_model_name_pattern = [
+        "input_layernorm.bias",
+        "input_layernorm.weight",
+        "attention.query_key_value.bias",
+        "attention.query_key_value.weight",
+        "attention.dense.bias",
+        "attention.dense.weight",
+        "post_attention_layernorm.bias",
+        "post_attention_layernorm.weight",
+        "mlp.dense_h_to_4h.bias",
+        "mlp.dense_h_to_4h.weight",
+        "mlp.dense_4h_to_h.bias",
+        "mlp.dense_4h_to_h.weight",
+    ]
+
+    huggingface_model_file_list = [__fn for __fn in os.listdir(args.in_file) if __fn.endswith(".bin")]
+    if len(huggingface_model_file_list) > 1:
+        multiprocessing_context = multiprocessing.get_context()
+        pool_fn = multiprocessing_context.Pool
+    else:
+        torch.multiprocessing.set_start_method("spawn")
+        pool_fn = multiprocessing.Pool
+
+    pool = pool_fn(args.processes)
+
+    for name, param in model.named_parameters():
+        array = param.detach().cpu().numpy().astype(np_weight_data_type)
+        # print("input shape", name, array.shape)
+        if name.find("weight") == -1 and name.find("bias") == -1:
+            print("skipped", name)
+            continue
+        elif name == "gpt_neox.embed_in.weight":
+            array.tofile(saved_dir + "model.wte.bin")
+        elif name == "gpt_neox.final_layer_norm.bias":
+            array.tofile(saved_dir + "model.final_layernorm.bias.bin")
+        elif name == "gpt_neox.final_layer_norm.weight":
+            array.tofile(saved_dir + "model.final_layernorm.weight.bin")
+        elif name == "embed_out.weight":
+            array.tofile(saved_dir + "model.lm_head.weight.bin")
+        else:
+            processed = False
+            for i in range(len(ft_model_name_pattern)):
+                if name.find(ft_model_name_pattern[i]) != -1:
+                    new_name = name.replace("gpt_neox.", "")
+                    pool.starmap(
+                        split_and_convert_process,
+                        [
+                            (
+                                saved_dir,
+                                factor,
+                                new_name,
+                                args,
+                                vars(model.config),
+                                array.T,
+                            )
+                        ],
+                    )
+                    processed = True
+                    break
+
+            if not processed:
+                print("Unused layer", name)
+
+    pool.close()
+    pool.join()
+
+    # Post-process biases if use_gptj_residual is True
+    if use_gptj_residual:
+        for layer_idx in range(hf_config["num_hidden_layers"]):
+            attn_bias = np.fromfile(
+                saved_dir + f"/model.layers.{layer_idx}.attention.dense.bias.bin",
+                dtype=np_weight_data_type,
+            )
+            mlp_bias = np.fromfile(
+                saved_dir + f"/model.layers.{layer_idx}.mlp.dense_4h_to_h.bias.bin",
+                dtype=np_weight_data_type,
+            )
+
+            (attn_bias + mlp_bias).astype(np_weight_data_type).tofile(
+                saved_dir + f"/model.layers.{layer_idx}.mlp.attention.bias.sum.bin"
+            )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "-saved_dir", "-o", type=str, help="file name of output file", required=True
+    )
+    parser.add_argument(
+        "-in_file",
+        "-i",
+        type=str,
+        help="file name of input checkpoint file",
+        required=True,
+    )
+    parser.add_argument(
+        "-infer_gpu_num",
+        "-i_g",
+        type=int,
+        help="How many gpus for inference",
+        required=True,
+    )
+    parser.add_argument(
+        "-processes",
+        "-p",
+        type=int,
+        help="How many processes to spawn for conversion (default: 4)",
+        default=4,
+    )
+    parser.add_argument(
+        "-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]
+    )
+    parser.add_argument(
+        "-model_name", "-m_n", type=str, help="model name", required=True
+    )
+
+    args = parser.parse_args()
+    print("\n=============== Argument ===============")
+    for key in vars(args):
+        print("{}: {}".format(key, vars(args)[key]))
+    print("========================================")
+
+    __dir = os.path.join(args.saved_dir, "%d-gpu" % args.infer_gpu_num)
+    assert not os.path.exists(__dir), "target path has exist, please remove %s first." % __dir
+
+    split_and_convert(args)
\ No newline at end of file
diff --git a/src/fastertransformer/kernels/beam_search_topk_kernels.cu b/src/fastertransformer/kernels/beam_search_topk_kernels.cu
index fe4bcbeb4..fcaf644b0 100644
--- a/src/fastertransformer/kernels/beam_search_topk_kernels.cu
+++ b/src/fastertransformer/kernels/beam_search_topk_kernels.cu
@@ -804,10 +804,10 @@ __global__ void insertUnfinishedPath(BeamHypotheses beam_hyps,
             const int length = beam_hyps.sequence_lengths_src[src_beam_idx];
 
             beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] =
-                beam_hyps.output_ids_src[length * batch_size * beam_width + bid * beam_width + src_beam_idx];
+                beam_hyps.output_ids_src[length * batch_size * beam_width + src_beam_idx];
             if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
                 beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] =
-                    beam_hyps.log_probs_src[length * batch_size * beam_width + bid * beam_width + src_beam_idx];
+                    beam_hyps.log_probs_src[length * batch_size * beam_width + src_beam_idx];
             }
             int prev_id = beam_hyps.parent_ids_src[length * batch_size * beam_width + src_beam_idx];
             for (int j = length - 1; j >= 0; j--) {
diff --git a/src/fastertransformer/kernels/custom_ar_kernels.cu b/src/fastertransformer/kernels/custom_ar_kernels.cu
index af8aee128..056ae375c 100644
--- a/src/fastertransformer/kernels/custom_ar_kernels.cu
+++ b/src/fastertransformer/kernels/custom_ar_kernels.cu
@@ -292,7 +292,7 @@ static __global__ void twoShotAllReduceKernel(AllReduceParams<T> params)
             // use round-robin gathering from other ranks
             int offset_rank = local_offset + (dst_rank[ii] - params.local_rank) * params.elts_per_rank;
             reinterpret_cast<PackedType*>(&params.local_output_buffer_ptr[offset_rank])[0] =
-                reinterpret_cast<PackedType*>(&src_d[dst_rank[ii]][offset_rank])[0];
+                reinterpret_cast<PackedType*>(&src_d[ii][offset_rank])[0];
         }
     }
 }
@@ -395,4 +395,4 @@ template void invokeOneOrTwoShotAllReduceKernel<__nv_bfloat16>(AllReduceParams<_
                                                                cudaStream_t                    stream);
 #endif
 template void invokeOneOrTwoShotAllReduceKernel<uint32_t>(AllReduceParams<uint32_t>& param, cudaStream_t stream);
-}  // namespace fastertransformer
\ No newline at end of file
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu b/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu
index 4618673d8..8c6e682a4 100644
--- a/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu
+++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu
@@ -41,6 +41,9 @@ void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t&
         case 96:
             mmha_launch_kernel<T, 96, 128, KERNEL_PARAMS_TYPE>(params, stream);
             break;
+        case 112:
+            mmha_launch_kernel<T, 112, 128, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
         case 128:
             mmha_launch_kernel<T, 128, 128, KERNEL_PARAMS_TYPE>(params, stream);
             break;
diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_112.cu b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_112.cu
new file mode 100644
index 000000000..3261791c3
--- /dev/null
+++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_112.cu
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "decoder_masked_multihead_attention_template.hpp"
+#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
+#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include <assert.h>
+#include <float.h>
+#include <type_traits>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Declares smem_sz and grid, then launches the kernel on `stream`; reads `params` from the expansion site's scope.
+#define MMHA_LAUNCH_KERNEL(                                                                                            \
+    T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, DO_CROSS_ATTENTION, HAS_BEAMS, stream)                \
+    size_t smem_sz = mmha::smem_size_in_bytes<T, DO_CROSS_ATTENTION>(params, THDS_PER_VALUE, THDS_PER_BLOCK);          \
+    dim3   grid(params.num_heads, params.batch_size);                                                                  \
+    mmha::masked_multihead_attention_kernel<T,                                                                         \
+                                            Dh,                                                                        \
+                                            Dh_MAX,                                                                    \
+                                            THDS_PER_KEY,                                                              \
+                                            THDS_PER_VALUE,                                                            \
+                                            THDS_PER_BLOCK,                                                            \
+                                            DO_CROSS_ATTENTION,                                                        \
+                                            HAS_BEAMS><<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params)
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Launcher for Masked_/Cross_multihead_attention_params: picks the kernel variant from tlength and cache_indir.
+template<typename T, int Dh, int Dh_MAX, typename KERNEL_PARAMS_TYPE>
+void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
+{
+    constexpr int  THREADS_PER_VALUE  = threads_per_value_t<T, Dh_MAX>::value;
+    constexpr bool DO_CROSS_ATTENTION = std::is_same<KERNEL_PARAMS_TYPE, Cross_multihead_attention_params<T>>::value;
+    int            tlength            = (DO_CROSS_ATTENTION) ? params.memory_max_len : params.timestep;
+    if (params.cache_indir == nullptr) {  // no cache_indir: HAS_BEAMS = false
+        if (tlength < 32) {  // THDS_PER_KEY = 4, THDS_PER_BLOCK = 64
+            MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, DO_CROSS_ATTENTION, false, stream);
+        }
+        else if (tlength < 2048) {  // THDS_PER_KEY = 2, THDS_PER_BLOCK = 128
+            MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, DO_CROSS_ATTENTION, false, stream);
+        }
+        else {  // THDS_PER_KEY = 1, THDS_PER_BLOCK = 256
+            MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, DO_CROSS_ATTENTION, false, stream);
+        }
+    }
+    else {  // cache_indir is set: same tlength tiers as above, with HAS_BEAMS = true
+        if (tlength < 32) {
+            MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, DO_CROSS_ATTENTION, true, stream);
+        }
+        else if (tlength < 2048) {
+            MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, DO_CROSS_ATTENTION, true, stream);
+        }
+        else {
+            MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, DO_CROSS_ATTENTION, true, stream);
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Explicit instantiations with Dh = 112, Dh_MAX = 128 for Masked_multihead_attention_params.
+template void mmha_launch_kernel<float, 112, 128, Masked_multihead_attention_params<float>>(
+    const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream);
+template void mmha_launch_kernel<uint16_t, 112, 128, Masked_multihead_attention_params<uint16_t>>(
+    const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
+#ifdef ENABLE_BF16
+template void mmha_launch_kernel<__nv_bfloat16, 112, 128, Masked_multihead_attention_params<__nv_bfloat16>>(
+    const Masked_multihead_attention_params<__nv_bfloat16>& params, const cudaStream_t& stream);
+#endif  // ENABLE_BF16
+#ifdef ENABLE_FP8
+template void mmha_launch_kernel<__nv_fp8_e4m3, 112, 128, Masked_multihead_attention_params<__nv_fp8_e4m3>>(
+    const Masked_multihead_attention_params<__nv_fp8_e4m3>& params, const cudaStream_t& stream);
+#endif  // ENABLE_FP8
+// Same set of instantiations for Cross_multihead_attention_params.
+template void mmha_launch_kernel<float, 112, 128, Cross_multihead_attention_params<float>>(
+    const Cross_multihead_attention_params<float>& params, const cudaStream_t& stream);
+template void mmha_launch_kernel<uint16_t, 112, 128, Cross_multihead_attention_params<uint16_t>>(
+    const Cross_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
+#ifdef ENABLE_BF16
+template void mmha_launch_kernel<__nv_bfloat16, 112, 128, Cross_multihead_attention_params<__nv_bfloat16>>(
+    const Cross_multihead_attention_params<__nv_bfloat16>& params, const cudaStream_t& stream);
+#endif  // ENABLE_BF16
+#ifdef ENABLE_FP8
+template void mmha_launch_kernel<__nv_fp8_e4m3, 112, 128, Cross_multihead_attention_params<__nv_fp8_e4m3>>(
+    const Cross_multihead_attention_params<__nv_fp8_e4m3>& params, const cudaStream_t& stream);
+#endif  // ENABLE_FP8
+
+#undef MMHA_LAUNCH_KERNEL
diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu
index ff28bae8b..89f0d5011 100644
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -98,19 +98,19 @@ template void invokeDecodingInitialize(bool*          finished,
 
 // PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
 template<typename T>
-__global__ void embeddingLookupPosEncoding(T*         from_tensor,
-                                           const T*   embedding_table,
-                                           const T*   position_encoding,
-                                           const int* all_ids,
-                                           const int* padding_count,
-                                           const int* input_lengths,
-                                           const int  local_token_num,
-                                           const int  hidden_units,
-                                           const int  step,
-                                           const int  max_input_length,
-                                           const int  token_num,
-                                           const int  ite,
-                                           const T    scale)
+__global__ void embeddingLookupPosEncoding(T*             from_tensor,
+                                           const T*       embedding_table,
+                                           const T*       position_encoding,
+                                           const int*     all_ids,
+                                           const int*     padding_count,
+                                           const int*     input_lengths,
+                                           const int      local_token_num,
+                                           const int64_t  hidden_units,
+                                           const int      step,
+                                           const int      max_input_length,
+                                           const int      token_num,
+                                           const int      ite,
+                                           const T        scale)
 {
     // 1. lookup from embedding table
     // 2. multiply scale
@@ -120,7 +120,7 @@ __global__ void embeddingLookupPosEncoding(T*         from_tensor,
     const bool use_padding_count = padding_count != nullptr;
     const bool use_input_len     = input_lengths != nullptr;
 
-    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units;
+    for (int64_t index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units;
          index += blockDim.x * gridDim.x) {
         const int row_index   = index / hidden_units;
         const int col_index   = index % hidden_units;
@@ -148,7 +148,7 @@ __global__ void embeddingLookup(T*                    from_tensor,
                                 const int*            all_ids,
                                 pPromptTuningParam<T> prompt_param,
                                 const int             local_token_num,
-                                const int             hidden_units,
+                                const int64_t         hidden_units,
                                 const int             step,
                                 const int             token_num,
                                 const int             ite,
@@ -159,7 +159,7 @@ __global__ void embeddingLookup(T*                    from_tensor,
     // 2. multiply scale
     const int id_offset = step * token_num + ite * local_token_num;
 
-    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units;
+    for (int64_t index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units;
          index += blockDim.x * gridDim.x) {
 
         const int word_index     = index / hidden_units;
@@ -313,15 +313,15 @@ INSTANTIATE_LOOKUP_POS_ENCODING_PAD_COUNT(__nv_bfloat16);
 #undef INSTANTIATE_LOOKUP_POS_ENCODING_PAD_COUNT
 
 template<typename T>
-__global__ void paddingEmbedding(T*        padded_embedding_kernel,
-                                 T*        padded_embedding_bias,
-                                 const T*  embedding_kernel,
-                                 const T*  embedding_bias,
-                                 const int hidden_unit,
-                                 const int vocab_size,
-                                 const int vocab_size_padded)
+__global__ void paddingEmbedding(T*            padded_embedding_kernel,
+                                 T*            padded_embedding_bias,
+                                 const T*      embedding_kernel,
+                                 const T*      embedding_bias,
+                                 const int64_t hidden_unit,
+                                 const int64_t vocab_size,
+                                 const int64_t vocab_size_padded)
 {
-    for (int id = threadIdx.x + blockIdx.x * blockDim.x; id < hidden_unit * vocab_size_padded;
+    for (int64_t id = threadIdx.x + blockIdx.x * blockDim.x; id < hidden_unit * vocab_size_padded;
          id += blockDim.x * gridDim.x) {
         int row_id = id / vocab_size_padded;
         int col_id = id % vocab_size_padded;
diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu
index abb3b5db4..7dc9af620 100644
--- a/src/fastertransformer/kernels/gpt_kernels.cu
+++ b/src/fastertransformer/kernels/gpt_kernels.cu
@@ -39,7 +39,7 @@ __global__ void start_id_embedding_position_lookups_kernel(T*
                                                            const int             length,
                                                            const int             max_length,
                                                            const int             batch_size,
-                                                           const int             hidden_units)
+                                                           const int64_t         hidden_units)
 {
     for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units;
          index += blockDim.x * gridDim.x) {
@@ -250,20 +250,20 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo
         const int beam_id   = tmp_index % param.beam_width;
         tmp_index           = (tmp_index - beam_id) / param.beam_width;
         const int batch_id  = tmp_index % param.batch_size;
+        const int64_t hidden_units = param.hidden_units;
         T         embedding =
             (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ?
-                        (T)param
-                    .prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * param.hidden_units
-                                                  + seq_id * param.hidden_units + hidden_id] :
-                        param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length
+                        (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units
+                                                      + seq_id * hidden_units + hidden_id] :
+                            param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length
                                                       + beam_id * param.max_input_length
                                                       + (seq_id - param.prefix_soft_prompt_lengths[batch_id])]
-                                          * param.hidden_units
+                                          * hidden_units
                                       + hidden_id];
 
         T pos_embed              = param.pos_table == nullptr ?
                                        (T)0.0f :
-                                       param.pos_table[(param.start_step + seq_id - 1) * param.hidden_units + hidden_id];
+                                       param.pos_table[(param.start_step + seq_id - 1) * hidden_units + hidden_id];
         param.from_tensor[index] = embedding + pos_embed;
 
         if (seq_id == 0 && hidden_id == 0) {
@@ -640,6 +640,7 @@ __global__ void generate_dups_indices(int*         batch_to_compact,
                                       int*         compact_size,
                                       const int*   shared_contexts,
                                       const size_t batch_size,
+                                      const size_t beam_width,
                                       const size_t input_seq_len)
 {
     const int padded_batchsize = blockDim.x * ((batch_size + blockDim.x - 1) / blockDim.x);
@@ -649,20 +650,23 @@ __global__ void generate_dups_indices(int*         batch_to_compact,
     __shared__ int                                                                   scan_offset;
 
     int scan = 0;
-    for (int batch = threadIdx.x; batch < padded_batchsize; batch += blockDim.x) {
-        bool masked     = (batch >= batch_size);
-        bool first_iter = batch < blockDim.x;
+    for (int seq_idx = threadIdx.x; seq_idx < padded_batchsize; seq_idx += blockDim.x) {
+        bool masked     = (seq_idx >= batch_size);
+        bool first_iter = seq_idx < blockDim.x;
 
-        int is_first_occur = masked ? 0 : shared_contexts[batch] == batch;
+        int is_first_occur = masked ? 0 : shared_contexts[seq_idx] == seq_idx;
         BlockScan(temp_storage).ExclusiveSum(is_first_occur, scan);
 
         if (!masked && is_first_occur) {
             int compact_idx = scan + (first_iter ? 0 : scan_offset);
             // Context rep. writes initial index
-            batch_to_compact[batch]       = compact_idx;
-            compact_to_batch[compact_idx] = batch;
+            batch_to_compact[seq_idx * beam_width] = compact_idx;
+            // input ids are tiled in context part
+            compact_to_batch[compact_idx] = seq_idx * beam_width;
         }
 
+        __syncthreads();
+
         if (threadIdx.x == blockDim.x - 1) {
             scan_offset = scan + is_first_occur + (first_iter ? 0 : scan_offset);
         }
@@ -671,8 +675,15 @@ __global__ void generate_dups_indices(int*         batch_to_compact,
 
         if (!masked && !is_first_occur) {
             // Fill the rest of batch_to_compact based on what rep. wrote
-            const int src_idx       = batch_to_compact[shared_contexts[batch]];
-            batch_to_compact[batch] = src_idx;
+            const int src_idx                      = batch_to_compact[shared_contexts[seq_idx] * beam_width];
+            batch_to_compact[seq_idx * beam_width] = src_idx;
+        }
+
+        if (!masked) {
+            // set same compact idx for beams
+            for (int beam_id = 1; beam_id < beam_width; ++beam_id) {
+                batch_to_compact[seq_idx * beam_width + beam_id] = batch_to_compact[seq_idx * beam_width];
+            }
         }
     }
 
@@ -696,14 +707,17 @@ void invokeFindContextDups(int*         shared_contexts,
                            int*         compact_size,
                            const int*   input_ids,
                            const size_t batch_size,
+                           const size_t beam_width,
                            const size_t input_seq_len,
                            cudaStream_t stream)
 {
     dim3 block{512};
     dim3 grid{((int)batch_size + block.x - 1) / block.x};
+    // set shared_contexts[i] = i
     init_shared_contexts<<<grid, block, 0, stream>>>(shared_contexts, batch_size);
 
     grid = dim3{(unsigned int)(batch_size * (batch_size - 1)) / 2};
+    // set shared_contexts[i] = j, where j = min{k, such that input_ids[k] == input_ids[i]}
     if (input_seq_len <= 128) {
         block = 128;
         find_context_dups<128><<<grid, block, 0, stream>>>(shared_contexts, input_ids, batch_size, input_seq_len);
@@ -713,8 +727,10 @@ void invokeFindContextDups(int*         shared_contexts,
         find_context_dups<256><<<grid, block, 0, stream>>>(shared_contexts, input_ids, batch_size, input_seq_len);
     }
 
+    // set batch_to_compact[i] = j, where j is the position of input_ids[i] in the compact_batch
+    // set compact_to_batch[i] = j, where j is such that compact_batch[i] = input_ids[j]
     generate_dups_indices<<<1, DUPS_INDICES_BLOCK_SIZE, 0, stream>>>(
-        batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, input_seq_len);
+        batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, beam_width, input_seq_len);
 }
 
 template<typename T>
diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h
index 617f9bc05..d78224e0a 100644
--- a/src/fastertransformer/kernels/gpt_kernels.h
+++ b/src/fastertransformer/kernels/gpt_kernels.h
@@ -127,6 +127,7 @@ void invokeFindContextDups(int*         shared_contexts,
                            int*         compact_size,
                            const int*   input_ids,
                            const size_t batch_size,
+                           const size_t beam_width,
                            const size_t input_seq_len,
                            cudaStream_t stream = 0);
 
diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu
index 5d6611153..a8d4b98fa 100644
--- a/src/fastertransformer/kernels/stop_criteria_kernels.cu
+++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu
@@ -150,7 +150,7 @@ void invokeLengthCriterion(bool*           finished,
 
     length_criterion<<<grid, block, 0, stream>>>(
         finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
-    while (((volatile size_t*)h_pinned_finished_sum_)[0] == -1) {};
+    while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {};
     sync_check_cuda_error();
 
     *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width;
diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu
index 90e8b8029..d0fb0a197 100644
--- a/src/fastertransformer/kernels/unfused_attention_kernels.cu
+++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu
@@ -268,23 +268,23 @@ __global__ void softmax_kernel(T*          attn_score,
     // attn_mask, [batch_size, q_length, k_length]
     // linear_bias_slopes, [num_heads]
 
-    const int bi = blockIdx.y;  // Batch index.
-    const int hi = blockIdx.z;  // Head index.
+    const int64_t bi = blockIdx.y;  // Batch index.
+    const int64_t hi = blockIdx.z;  // Head index.
 
     __shared__ float s_mean, s_max;
 
     const float linear_bias_slope = linear_bias_slopes != nullptr ? (float)linear_bias_slopes[hi] : 0.0f;
 
     // Loop along with Q dimension.
-    for (int qi = blockIdx.x; qi < q_length; qi += gridDim.x) {
+    for (int64_t qi = blockIdx.x; qi < q_length; qi += gridDim.x) {
 
         float data[ITEMS_PER_THREAD];
-        int   qk_offset;
+        int64_t   qk_offset;
         float local_max = -1e20f;
 
         // Loop along with K dimension.
-        for (int i = 0; blockDim.x * i + threadIdx.x < k_length; i++) {
-            int ki    = blockDim.x * i + threadIdx.x;  // Index of K dimension.
+        for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) {
+            int64_t ki    = blockDim.x * i + threadIdx.x;  // Index of K dimension.
             qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki;
 
             float qk_val  = static_cast<float>(qk[qk_offset]);
@@ -297,7 +297,7 @@ __global__ void softmax_kernel(T*          attn_score,
                 qk_bias += static_cast<float>(linear_bias_slope * (ki - qi));
             }
 
-            int   mask_offset = (bi * q_length + qi) * k_length + ki;
+            int64_t   mask_offset = (bi * q_length + qi) * k_length + ki;
             float mask_val    = static_cast<float>(ldg(&attn_mask[mask_offset]));
             qk_bias += (1.0f - mask_val) * -10000.0f;
 
@@ -312,7 +312,7 @@ __global__ void softmax_kernel(T*          attn_score,
         __syncthreads();
 
         float local_sum = 0;
-        for (int i = 0; blockDim.x * i + threadIdx.x < k_length; i++) {
+        for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) {
             data[i] = __expf(data[i] - s_max);
             local_sum += data[i];
         }
@@ -324,7 +324,7 @@ __global__ void softmax_kernel(T*          attn_score,
         }
         __syncthreads();
 
-        for (int i = 0; blockDim.x * i + threadIdx.x < k_length; i++) {
+        for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) {
             qk_offset             = ((bi * head_num + hi) * q_length + qi) * k_length + blockDim.x * i + threadIdx.x;
             attn_score[qk_offset] = (T)(data[i] * s_mean);
         }
@@ -602,6 +602,7 @@ __global__ void softmax_kernel_h2_v2(T*        attn_score,
 
 #define LAUNCH_MAKSED_SOFTMAX_(T_, ITEMS_PER_THREAD)                                                                   \
     block.x /= ITEMS_PER_THREAD;                                                                                       \
+    block.x = (block.x + 31) / 32 * 32;                                                                                \
     assert(block.x <= 1024);                                                                                           \
     if (is_half2) {                                                                                                    \
         if (grid.x % 4 == 0) {                                                                                         \
diff --git a/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc
index 1dda95b6d..fb78d5b3d 100644
--- a/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc
+++ b/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc
@@ -45,6 +45,7 @@ void TensorParallelGeluFfnLayer<T>::forward(TensorMap*          output_tensors,
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
         use_custom_all_reduce_kernel =
             custom_all_reduce_comm_->swapInternalBuffer(&swap_tensors, token_num * hidden_units);
+        output_tensors->at("ffn_output").data = swap_tensors[0].data;
     }
 
     GeluFfnLayer<T>::forward(output_tensors, input_tensors, ffn_weights);
@@ -57,6 +58,7 @@ void TensorParallelGeluFfnLayer<T>::forward(TensorMap*          output_tensors,
         }
         else {
             custom_all_reduce_comm_->customAllReduce(token_num * hidden_units, GeluFfnLayer<T>::stream_);
+            output_tensors->at("ffn_output").data = swap_tensors[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc
index 29ac2846e..e8646c7d1 100644
--- a/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc
+++ b/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc
@@ -45,6 +45,7 @@ void TensorParallelReluFfnLayer<T>::forward(TensorMap*          output_tensors,
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
         use_custom_all_reduce_kernel =
             custom_all_reduce_comm_->swapInternalBuffer(&swap_tensors, token_num * hidden_units);
+        output_tensors->at("ffn_output").data = swap_tensors[0].data;
     }
 
     ReluFfnLayer<T>::forward(output_tensors, input_tensors, ffn_weights);
@@ -57,6 +58,7 @@ void TensorParallelReluFfnLayer<T>::forward(TensorMap*          output_tensors,
         }
         else {
             custom_all_reduce_comm_->customAllReduce(token_num * hidden_units, ReluFfnLayer<T>::stream_);
+            output_tensors->at("ffn_output").data = swap_tensors[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc
index 25a2da86b..bfc781cc4 100644
--- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc
+++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc
@@ -44,6 +44,7 @@ void TensorParallelSiluFfnLayer<T>::forward(TensorMap*          output_tensors,
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
         use_custom_all_reduce_kernel =
             custom_all_reduce_comm_->swapInternalBuffer(&swap_tensors, token_num * hidden_units);
+        output_tensors->at("ffn_output").data = swap_tensors[0].data;
     }
 
     SiluFfnLayer<T>::forward(output_tensors, input_tensors, ffn_weights);
@@ -55,6 +56,7 @@ void TensorParallelSiluFfnLayer<T>::forward(TensorMap*          output_tensors,
         }
         else {
             custom_all_reduce_comm_->customAllReduce(token_num * hidden_units, SiluFfnLayer<T>::stream_);
+            output_tensors->at("ffn_output").data = swap_tensors[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu b/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu
index 7d022c4f4..55c4d9071 100644
--- a/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu
+++ b/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu
@@ -796,8 +796,8 @@ DecoderCrossAttentionLayer<T>::DecoderCrossAttentionLayer(size_t           max_b
     q_scaling_(q_scaling)
 {
     FT_CHECK(size_per_head_ == 32 || size_per_head_ == 48 || size_per_head_ == 64 || size_per_head_ == 80
-             || size_per_head_ == 96 || size_per_head_ == 128 || size_per_head_ == 144 || size_per_head_ == 160
-             || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256);
+             || size_per_head_ == 96 || size_per_head_ == 112 || size_per_head_ == 128 || size_per_head_ == 144
+             || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256);
 }
 
 template<typename T>
@@ -1030,4 +1030,4 @@ template class DecoderCrossAttentionLayer<half>;
 template class DecoderCrossAttentionLayer<__nv_bfloat16>;
 #endif
 
-}  // namespace fastertransformer
\ No newline at end of file
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
index 7ff426128..44fed478b 100644
--- a/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
@@ -278,8 +278,8 @@ DecoderSelfAttentionLayer<T>::DecoderSelfAttentionLayer(size_t           max_bat
     int8_mode_(int8_mode)
 {
     FT_CHECK(size_per_head_ == 32 || size_per_head_ == 48 || size_per_head_ == 64 || size_per_head_ == 80
-             || size_per_head_ == 96 || size_per_head_ == 128 || size_per_head_ == 144 || size_per_head_ == 160
-             || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256);
+             || size_per_head_ == 96 || size_per_head_ == 112 || size_per_head_ == 128 || size_per_head_ == 144
+             || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256);
     if (int8_mode_ == 1) {
         FT_CHECK_WITH_INFO(!(std::is_same<T, float>::value), "Weight only quant not supported for fp32.");
         weight_only_int8_fc_runner_ = std::make_shared<CutlassFpAIntBGemmRunner<T, uint8_t>>();
diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc
index 8f14ff14a..0672b7150 100644
--- a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc
@@ -104,11 +104,12 @@ void TensorParallelDecoderCrossAttentionLayer<T>::forward(TensorMap*
     //      value_cache [batch, head_num, max_seq_len, size_per_head]
 
     const size_t size = output_tensors->at("hidden_features").size();
+    std::vector<Tensor> reduce_tensor{output_tensors->at("hidden_features")};
 
     bool use_custom_all_reduce_kernel = false;
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
-        std::vector<Tensor> reduce_tensor{output_tensors->at("hidden_features")};
         use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&reduce_tensor, size);
+        output_tensors->at("hidden_features").data = reduce_tensor[0].data;
     }
 
     DecoderCrossAttentionLayer<T>::forward(output_tensors, input_tensors, attention_weights);
@@ -121,6 +122,7 @@ void TensorParallelDecoderCrossAttentionLayer<T>::forward(TensorMap*
         }
         else {
             custom_all_reduce_comm_->customAllReduce(size, DecoderCrossAttentionLayer<T>::stream_);
+            output_tensors->at("hidden_features").data = reduce_tensor[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc
index 4eb9159cf..fbb726e2d 100644
--- a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc
@@ -200,11 +200,12 @@ void TensorParallelDecoderSelfAttentionLayer<T>::forward(TensorMap*
     //      value_cache [batch, head_num, max_seq_len, size_per_head]
 
     const size_t size = output_tensors->at("hidden_features").size();
+    std::vector<Tensor> reduce_tensor{output_tensors->at("hidden_features")};
 
     bool use_custom_all_reduce_kernel = false;
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr && do_all_reduce_) {
-        std::vector<Tensor> reduce_tensor{output_tensors->at("hidden_features")};
         use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&reduce_tensor, size);
+        output_tensors->at("hidden_features").data = reduce_tensor[0].data;
     }
 
     DecoderSelfAttentionLayer<T>::forward(output_tensors, input_tensors, attention_weights);
@@ -217,6 +218,7 @@ void TensorParallelDecoderSelfAttentionLayer<T>::forward(TensorMap*
         }
         else {
             custom_all_reduce_comm_->customAllReduce(size, DecoderSelfAttentionLayer<T>::stream_);
+            output_tensors->at("hidden_features").data = reduce_tensor[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc
index dd66344a7..72840a960 100644
--- a/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc
@@ -35,11 +35,12 @@ void TensorParallelDisentangledAttentionLayer<T>::forward(TensorMap*
     // For more information, please refer to DisentangledAttentionLayer
 
     const size_t size = output_tensors->at("hidden_features").size();
+    std::vector<Tensor> hidden_features_reduce = {output_tensors->at("hidden_features")};
 
     bool use_custom_all_reduce_kernel = false;
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
-        std::vector<Tensor> hidden_features_reduce = {output_tensors->at("hidden_features")};
         use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&hidden_features_reduce, size);
+        output_tensors->at("hidden_features").data = hidden_features_reduce[0].data;
     }
 
     DisentangledAttentionLayer<T>::forward(output_tensors, input_tensors, attention_weights);
@@ -52,6 +53,7 @@ void TensorParallelDisentangledAttentionLayer<T>::forward(TensorMap*
         }
         else {
             custom_all_reduce_comm_->customAllReduce(size, DisentangledAttentionLayer<T>::stream_);
+            output_tensors->at("hidden_features").data = hidden_features_reduce[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc
index 55cb5efd7..bb4140ea3 100644
--- a/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc
@@ -35,11 +35,12 @@ void TensorParallelGptContextAttentionLayer<T>::forward(TensorMap*
     //      value_cache [batch, local_head_num, max_seq_len, size_per_head]
 
     const size_t size = output_tensors->at("hidden_features").size();
+    std::vector<Tensor> reduce_tensor{output_tensors->at("hidden_features")};
 
     bool use_custom_all_reduce_kernel = false;
     if (do_all_reduce_ && enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
-        std::vector<Tensor> reduce_tensor{output_tensors->at("hidden_features")};
         use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&reduce_tensor, size);
+        output_tensors->at("hidden_features").data = reduce_tensor[0].data;
     }
 
     GptContextAttentionLayer<T>::forward(output_tensors, input_tensors, attention_weights);
@@ -52,6 +53,7 @@ void TensorParallelGptContextAttentionLayer<T>::forward(TensorMap*
         }
         else {
             custom_all_reduce_comm_->customAllReduce(size, GptContextAttentionLayer<T>::stream_);
+            output_tensors->at("hidden_features").data = reduce_tensor[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc
index 0b7f2dc05..91cc6f416 100644
--- a/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc
@@ -37,11 +37,12 @@ void TensorParallelUnfusedAttentionLayer<T>::forward(TensorMap*                o
     // For more information, please refer to UnfusedAttentionLayer
 
     const size_t size = output_tensors->at("hidden_features").size();
+    std::vector<Tensor> hidden_features_reduce = {output_tensors->at("hidden_features")};
 
     bool use_custom_all_reduce_kernel = false;
     if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
-        std::vector<Tensor> hidden_features_reduce = {output_tensors->at("hidden_features")};
         use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&hidden_features_reduce, size);
+        output_tensors->at("hidden_features").data = hidden_features_reduce[0].data;
     }
 
     UnfusedAttentionLayer<T>::forward(output_tensors, input_tensors, attention_weights);
@@ -53,6 +54,7 @@ void TensorParallelUnfusedAttentionLayer<T>::forward(TensorMap*                o
         }
         else {
             custom_all_reduce_comm_->customAllReduce(size, UnfusedAttentionLayer<T>::stream_);
+            output_tensors->at("hidden_features").data = hidden_features_reduce[0].data;
         }
         sync_check_cuda_error();
     }
diff --git a/src/fastertransformer/models/bert/Bert.cc b/src/fastertransformer/models/bert/Bert.cc
index 9b51c89cc..320fa29cf 100644
--- a/src/fastertransformer/models/bert/Bert.cc
+++ b/src/fastertransformer/models/bert/Bert.cc
@@ -510,10 +510,11 @@ void Bert<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, const
                       Tensor{MEMORY_GPU, data_type, std::vector<size_t>{h_token_num, hidden_units_}, attn_out_buf_}}});
 
                 bool use_custom_all_reduce_kernel = false;
+                std::vector<Tensor> hidden_features{attn_output_tensors.at("hidden_features")};
                 if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) {
-                    std::vector<Tensor> hidden_features{attn_output_tensors.at("hidden_features")};
                     use_custom_all_reduce_kernel =
                         custom_all_reduce_comm_->swapInternalBuffer(&hidden_features, h_token_num * hidden_units_);
+                    attn_output_tensors.at("hidden_features").data = hidden_features[0].data;
                 }
 
                 if (attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::FUSED_PADDED_MHA) {
@@ -535,6 +536,7 @@ void Bert<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, const
                     }
                     else {
                         custom_all_reduce_comm_->customAllReduce(h_token_num * hidden_units_, stream_);
+                        attn_output_tensors.at("hidden_features").data = hidden_features[0].data;
                     }
                     sync_check_cuda_error();
                 }
diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h
index 3662256c1..2850da466 100644
--- a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h
+++ b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h
@@ -28,7 +28,7 @@ namespace fastertransformer {
 template<typename T>
 struct GptNeoXDecoderLayerWeight {
 public:
-    GptNeoXDecoderLayerWeight() = delete;
+    GptNeoXDecoderLayerWeight() = default;
     GptNeoXDecoderLayerWeight(const int  hidden_units,
                               const int  inter_size,
                               const int  tensor_para_size  = 1,
diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc
index 0f052a3a3..26995f255 100644
--- a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc
+++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc
@@ -278,6 +278,16 @@ void GptNeoXWeight<T>::loadModel(std::string dir_path)
     }
 }
 
+template<typename T>
+void GptNeoXWeight<T>::resizeLayer(const int num_layer)  // sets num_layer_, then appends num_layer layer weights
+{
+    num_layer_ = num_layer;
+    decoder_layer_weights.reserve(num_layer_);  // capacity only; entries already in the vector are kept
+    for (int l = 0; l < num_layer_; l++) {
+        decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight<T>());  // default-constructed weights
+    }
+}
+
 template<typename T>
 bool GptNeoXWeight<T>::isValidLayerParallelId(int l)
 {
diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.h b/src/fastertransformer/models/gptneox/GptNeoXWeight.h
index 2a6b1764e..3e868854e 100644
--- a/src/fastertransformer/models/gptneox/GptNeoXWeight.h
+++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.h
@@ -47,6 +47,8 @@ struct GptNeoXWeight {
 
     void loadModel(std::string dir_path);
 
+    void resizeLayer(const int num_layer);
+
     std::vector<GptNeoXDecoderLayerWeight<T>*> decoder_layer_weights;
     const T*                                   pre_decoder_embedding_table = nullptr;
     // GPT-J does not use embedding table, but we leave the ptr such that
@@ -65,6 +67,11 @@ struct GptNeoXWeight {
     LayerNormWeight<T> post_decoder_layernorm;
     DenseWeight<T>     post_decoder_embedding;
 
+    inline void setMaxSeqLen(size_t max_seq_len)  // setter: stores the value in max_seq_len_, nothing else
+    {
+        max_seq_len_ = max_seq_len;
+    }
+
 private:
     void setWeightPtr();
     void mallocWeights();
diff --git a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
index 2b9e4f3c4..93b80ae6e 100644
--- a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
+++ b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
@@ -101,7 +101,11 @@ void ParallelGpt<T>::allocateBuffer(size_t batch_size,
                                     bool   is_return_context_cum_log_probs)
 {
     FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    const size_t batchxbeam = batch_size * beam_width;
+    const size_t batchxbeam       = batch_size * beam_width;
+    const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_);
+    FT_CHECK(batch_size % local_batch_size == 0);
+    const size_t num_microbatches = batch_size / local_batch_size;
+
     const size_t self_cache_size =
         (num_layer_ / pipeline_para_.world_size_) * batchxbeam * memory_len * hidden_units_ / tensor_para_.world_size_;
 
@@ -111,8 +115,8 @@ void ParallelGpt<T>::allocateBuffer(size_t batch_size,
         padded_embedding_kernel_ptr_ = padded_embedding_kernel_;
     }
 
-    input_attention_mask_ = (T*)(allocator_->reMalloc(
-        input_attention_mask_, sizeof(T) * batchxbeam * max_input_len * max_input_len, false));
+    tiled_input_attention_mask_ = (T*)(allocator_->reMalloc(
+        tiled_input_attention_mask_, sizeof(T) * batchxbeam * max_input_len * max_input_len, false));
     decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
     decoder_normed_input_buf_ =
         (T*)(allocator_->reMalloc(decoder_normed_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
@@ -125,7 +129,6 @@ void ParallelGpt<T>::allocateBuffer(size_t batch_size,
         (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false));
     cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false));
     finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false));
-    h_finished_buf_   = new bool[batchxbeam];
     sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false));
 
     key_cache_   = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true));
@@ -154,7 +157,8 @@ void ParallelGpt<T>::allocateBuffer(size_t batch_size,
     output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_session_len, true));
     parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_session_len, true));
     seq_limit_len_  = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false));
-    masked_tokens_  = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * memory_len, true));
+    tiled_masked_tokens_ =
+        (bool*)(allocator_->reMalloc(tiled_masked_tokens_, sizeof(bool) * batchxbeam * memory_len, true));
 
     context_decoder_input_buf_  = (T*)(allocator_->reMalloc(
         context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));
@@ -184,12 +188,13 @@ void ParallelGpt<T>::allocateBuffer(size_t batch_size,
         lp_logprob_buf_ = (float*)allocator_->reMalloc(lp_logprob_buf_, sizeof(float) * batchxbeam * max_input_len);
     }
     if (shared_contexts_ratio_ > 0.0f) {
-        shared_contexts_idx_  = (int*)allocator_->reMalloc(shared_contexts_idx_, 3 * batch_size * sizeof(int), false);
-        batch_to_compact_idx_ = shared_contexts_idx_ + batch_size;
-        compact_idx_          = shared_contexts_idx_ + 2 * batch_size;
+        shared_contexts_idx_  = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false);
+        batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false);
+        compact_idx_          = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false);
         compact_size_         = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false);
     }
-    generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true);
+    microbatch_should_stop_ =
+        (bool*)allocator_->reMalloc(microbatch_should_stop_, sizeof(bool) * num_microbatches, true, true);
     tiled_total_padding_count_ =
         (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false);
 
@@ -205,7 +210,7 @@ void ParallelGpt<T>::freeBuffer()
             allocator_->free((void**)(&padded_embedding_kernel_));
         }
 
-        allocator_->free((void**)(&input_attention_mask_));
+        allocator_->free((void**)(&tiled_input_attention_mask_));
         allocator_->free((void**)(&decoder_input_buf_));
         allocator_->free((void**)(&decoder_output_buf_));
         allocator_->free((void**)(&normed_decoder_output_buf_));
@@ -213,7 +218,6 @@ void ParallelGpt<T>::freeBuffer()
         allocator_->free((void**)(&nccl_logits_buf_));
         allocator_->free((void**)(&cum_log_probs_));
         allocator_->free((void**)(&finished_buf_));
-        delete[] h_finished_buf_;
         allocator_->free((void**)(&sequence_lengths_));
 
         allocator_->free((void**)(&key_cache_));
@@ -230,7 +234,7 @@ void ParallelGpt<T>::freeBuffer()
         allocator_->free((void**)(&transposed_output_ids_buf_));
         allocator_->free((void**)(&output_ids_buf_));
         allocator_->free((void**)(&parent_ids_buf_));
-        allocator_->free((void**)(&masked_tokens_));
+        allocator_->free((void**)(&tiled_masked_tokens_));
 
         allocator_->free((void**)(&seq_limit_len_));
 
@@ -254,7 +258,7 @@ void ParallelGpt<T>::freeBuffer()
         allocator_->free((void**)(&lp_nccl_logits_buf_));
         allocator_->free((void**)(&lp_logprob_buf_));
 
-        allocator_->free((void**)(&generation_should_stop_), true);
+        allocator_->free((void**)(&microbatch_should_stop_), true);
 
         if (shared_contexts_ratio_ > 0.0f) {
             allocator_->free((void**)(&shared_contexts_idx_));
@@ -416,6 +420,8 @@ void ParallelGpt<T>::computeContextCumLogProbs(float*                      cum_l
     const size_t batchxbeam      = batch_size * beam_width;
     const size_t n_hidden_states = batchxbeam * max_input_length;
 
+    const cudaDataType_t cublas_type = getCudaDataType<T>();
+
     if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) {
         // normed decoder output [batch_size * beam_width, max_input_length, hidden_units_]
         invokeGeneralLayerNorm(lp_normed_decoder_output_buf_,
@@ -439,10 +445,10 @@ void ParallelGpt<T>::computeContextCumLogProbs(float*                      cum_l
                                   hidden_units_,  // k
                                   &alpha,
                                   padded_embedding_kernel_ptr_,
-                                  sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F,
+                                  cublas_type,
                                   hidden_units_,  // k
                                   lp_normed_decoder_output_buf_,
-                                  sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F,
+                                  cublas_type,
                                   hidden_units_,  // k
                                   &beta,
                                   lp_logits_buf_,
@@ -464,10 +470,10 @@ void ParallelGpt<T>::computeContextCumLogProbs(float*                      cum_l
                                   hidden_units_,  // k
                                   &alpha,
                                   padded_embedding_kernel_ptr_ + tensor_para_.rank_ * local_vocab_size * hidden_units_,
-                                  sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F,
+                                  cublas_type,
                                   hidden_units_,  // k
                                   lp_normed_decoder_output_buf_,
-                                  sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F,
+                                  cublas_type,
                                   hidden_units_,  // k
                                   &beta,
                                   lp_nccl_logits_buf_ + tensor_para_.rank_ * n_hidden_states * local_vocab_size,
@@ -809,8 +815,9 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
         num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, local_head_num_, memory_len, size_per_head_};
 
     {
-        PUSH_RANGE("dynamic decode setup");
         TensorMap input_map(*input_tensors);
+
+        PUSH_RANGE("dynamic decode setup");
         dynamic_decode_layer_->setup(batch_size, beam_width, &input_map);
         handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size);
         handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size);
@@ -858,7 +865,7 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
         PUSH_RANGE("initialize output and parent ids");
         cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * session_len, stream_);
         cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * session_len, stream_);
-        cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * memory_len, stream_);
+        cudaMemsetAsync(tiled_masked_tokens_, false, sizeof(bool) * batch_size * beam_width * memory_len, stream_);
         cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_);
         if (beam_width > 1) {
             cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * memory_len, stream_);
@@ -889,6 +896,7 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                                   compact_size_,
                                   input_tensors->at("input_ids").getPtr<int>(),
                                   batch_size,
+                                  beam_width,
                                   max_input_length,
                                   stream_);
             cudaD2Hcpy(&compact_size, compact_size_, 1);
@@ -1028,7 +1036,7 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                 POP_RANGE;
             }
             PUSH_RANGE("build decoder attention mask");
-            invokeBuildDecoderAttentionMask(input_attention_mask_,
+            invokeBuildDecoderAttentionMask(tiled_input_attention_mask_,
                                             tiled_input_lengths_buf_,
                                             nullptr,
                                             batch_size * beam_width,
@@ -1049,15 +1057,16 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                   Tensor(MEMORY_GPU,
                          data_type,
                          {batch_size * beam_width, 1, (size_t)max_input_length, (size_t)max_input_length},
-                         input_attention_mask_)},
+                         tiled_input_attention_mask_)},
                  {"input_lengths",
                   Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_)}});
 
             if (use_shared_contexts) {
                 decoder_input_tensors.insert("compact_idx",
                                              Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_));
-                decoder_input_tensors.insert("batch_to_compact_idx",
-                                             Tensor(MEMORY_GPU, TYPE_INT32, {batch_size}, batch_to_compact_idx_));
+                decoder_input_tensors.insert(
+                    "batch_to_compact_idx",
+                    Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_));
             }
             if (gpt_variant_params_.use_attention_linear_bias) {
                 decoder_input_tensors.insert("linear_bias_slopes",
@@ -1169,7 +1178,7 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
     }
 
     PUSH_RANGE("mask padding tokens");
-    invokeMaskPaddingTokens(masked_tokens_,
+    invokeMaskPaddingTokens(tiled_masked_tokens_,
                             input_tensors->at("input_lengths").getPtr<int>(),
                             memory_len,
                             max_input_length,
@@ -1184,6 +1193,10 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
 
     const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_);
     FT_CHECK(batch_size % local_batch_size == 0);
+    const size_t iteration_num = batch_size / local_batch_size;
+    for (size_t microbatch = 0; microbatch < iteration_num; ++microbatch) {
+        microbatch_should_stop_[microbatch] = false;  // clear every per-microbatch stop flag before the step loop
+    }
 
     for (step_ = step_start; step_ < (int)gen_len; step_++) {
         // Loop body produces Nth token by embedding && encoding token (N-1)
@@ -1192,11 +1205,14 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
         const int  src_indir_idx    = (step_ - step_start) % 2;
         const int  tgt_indir_idx    = 1 - src_indir_idx;
 
-        const size_t iteration_num = batch_size / local_batch_size;
-        *generation_should_stop_   = !fill_caches_only;
+        bool generation_should_stop = !fill_caches_only;
 
         PUSH_RANGE(fmtstr("token_%d", step_ - step_start));
         for (uint ite = 0; ite < iteration_num; ++ite) {
+            // skip microbatches that already finished in a previous step
+            if (microbatch_should_stop_[ite]) {
+                continue;
+            }
             const int id_offset               = ite * local_batch_size * beam_width;
             const int hidden_units_offset     = id_offset * hidden_units_;
             const int vocab_size_units_offset = id_offset * vocab_size_padded_;
@@ -1214,10 +1230,9 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                            pipeline_para_,
                            stream_);
 
-                // receive updated generation_should_stop_ from last rank
-                if (ite == 0) {
-                    ftNcclRecv(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_);
-                }
+                // receive updated microbatch_should_stop_ from last rank
+                ftNcclRecv(microbatch_should_stop_ + ite, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_);
+                generation_should_stop &= microbatch_should_stop_[ite];
 
                 // receive updated cache_indirections from last rank
                 if (beam_width > 1) {
@@ -1241,10 +1256,10 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                 // throw errors when detected
                 ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_);
                 sync_check_cuda_error();
-
-                if (ite == 0 && *generation_should_stop_) {
-                    break;
-                }
+            }
+            // skip this microbatch if its stop flag (possibly just updated by the last rank) is set
+            if (microbatch_should_stop_[ite]) {
+                continue;
             }
 
             if ((max_input_length <= 1) || (step_ > step_start) || continue_gen) {
@@ -1302,7 +1317,7 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                       Tensor(MEMORY_GPU,
                              TYPE_BOOL,
                              {local_batch_size * beam_width, memory_len},
-                             masked_tokens_ + id_offset * memory_len)}});
+                             tiled_masked_tokens_ + id_offset * memory_len)}});
                 if (beam_width > 1) {
                     decoder_input_tensors.insert({"cache_indirection",
                                                   Tensor(MEMORY_GPU,
@@ -1403,7 +1418,7 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                                           CUDA_R_32F,
                                           cublasGemmAlgo_t(-1));
                     POP_RANGE;
-                    PUSH_RANGE("logits all reduce sum");
+                    PUSH_RANGE("logits all gather");
                     ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset,
                                     nccl_logits_buf_ + vocab_size_units_offset,
                                     local_batch_size * beam_width * local_vocab_size,
@@ -1484,9 +1499,14 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
 
                 PUSH_RANGE("result sampling and stop check");
                 dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
-                *generation_should_stop_ &= subbatch_should_stop;
+                generation_should_stop &= subbatch_should_stop;
+                microbatch_should_stop_[ite] = subbatch_should_stop;
                 POP_RANGE;
             }
+            else {
+                // other ranks cannot update generation_should_stop via DynamicDecode; fold in the microbatch flag
+                generation_should_stop &= microbatch_should_stop_[ite];
+            }
 
             PUSH_RANGE("result communication");
             // send results to other rank
@@ -1504,10 +1524,8 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
                     ftNcclSend(
                         sequence_lengths_ + id_offset, local_batch_size * beam_width, i, pipeline_para_, stream_);
 
-                    // send updated generation_should_stop_
-                    if (ite == 0) {
-                        ftNcclSend(generation_should_stop_, 1, i, pipeline_para_, stream_);
-                    }
+                    // send updated microbatch_should_stop_
+                    ftNcclSend(microbatch_should_stop_ + ite, 1, i, pipeline_para_, stream_);
 
                     // send updated cache_indirections
                     if (beam_width > 1) {
@@ -1547,13 +1565,20 @@ void ParallelGpt<T>::forward(std::unordered_map<std::string, Tensor>*       outp
         if (step_ == initial_step + max_input_length) {
             /* We have just finished processing input: update the padding count:
              * total_padding_count += (max_input_length - input_lengths) */
+            PUSH_RANGE("Update padding count");
             invokeUpdatePaddingCount(tiled_total_padding_count_,
                                      input_tensors->at("input_lengths").getPtr<int>(),
                                      max_input_length,
                                      batch_size,
                                      beam_width,
                                      stream_);
+            POP_RANGE;
+        }
+        if (generation_should_stop) {
+            POP_RANGE;  // close this step's "token_%d" range; break skips the POP_RANGE at the loop end
+            break;
         }
+
         POP_RANGE;
     }
     PUSH_RANGE("communicate tensors");
@@ -1605,6 +1630,7 @@ void ParallelGpt<T>::setOutputTensors(std::unordered_map<std::string, Tensor>*
                                       const size_t                                   max_context_len,
                                       const size_t                                   max_input_without_prompt_length)
 {
+    if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { PUSH_RANGE("Resolve output tensors"); }
     if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) {
         return;
     }
@@ -1706,6 +1732,7 @@ void ParallelGpt<T>::setOutputTensors(std::unordered_map<std::string, Tensor>*
         cudaD2Dcpy(
             output_tensors->at("is_finished").getPtr<bool>(), finished_buf_, output_tensors->at("is_finished").size());
     }
+    POP_RANGE;
 }
 
 template<typename T>
diff --git a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
index 39b6bab5e..ea24de2d3 100644
--- a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
+++ b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
@@ -116,7 +116,7 @@ class ParallelGpt: public BaseLayer {
     T*       padded_embedding_kernel_;
     const T* padded_embedding_kernel_ptr_;
 
-    T* input_attention_mask_;
+    T* tiled_input_attention_mask_;
 
     T*        decoder_input_buf_;
     T*        decoder_normed_input_buf_ = nullptr;
@@ -126,10 +126,9 @@ class ParallelGpt: public BaseLayer {
     float*    nccl_logits_buf_;
     float*    cum_log_probs_;
     bool*     finished_buf_;
-    bool*     h_finished_buf_;
     int*      sequence_lengths_       = nullptr;
     uint32_t* seq_limit_len_          = nullptr;
-    bool*     generation_should_stop_ = nullptr;
+    bool*     microbatch_should_stop_ = nullptr;
 
     int* shared_contexts_idx_      = nullptr;
     T*   compact_decoder_features_ = nullptr;
@@ -154,7 +153,7 @@ class ParallelGpt: public BaseLayer {
     int*  transposed_output_ids_buf_;
     int*  output_ids_buf_;
     int*  parent_ids_buf_;
-    bool* masked_tokens_ = nullptr;
+    bool* tiled_masked_tokens_ = nullptr;
 
     T*     context_decoder_input_buf_;
     T*     context_decoder_normed_input_buf_;
diff --git a/src/fastertransformer/models/t5/T5DecodingWeight.cc b/src/fastertransformer/models/t5/T5DecodingWeight.cc
index 09a657fe0..e99f7145f 100644
--- a/src/fastertransformer/models/t5/T5DecodingWeight.cc
+++ b/src/fastertransformer/models/t5/T5DecodingWeight.cc
@@ -269,9 +269,11 @@ void T5DecodingWeight<T>::loadModel(std::string dir_path)
     }
 
     if (t5_with_bias) {
-        loadWeightFromBin<T>(
-            weights_ptr[4], {(size_t)weights_size[4]}, dir_path + "/decoder.final_layer_norm.bias.bin");
-        loadWeightFromBin<T>(weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/shared.bias.bin");
+        loadWeightFromBin<T>(weights_ptr[4],
+                             {(size_t)weights_size[4]},
+                             dir_path + "/decoder.final_layer_norm.bias.bin",
+                             model_file_type);
+        loadWeightFromBin<T>(weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/shared.bias.bin", model_file_type);
     }
 
     for (int l = 0; l < num_layer_; l++) {
diff --git a/src/fastertransformer/th_op/CMakeLists.txt b/src/fastertransformer/th_op/CMakeLists.txt
index ddd6be058..b9f2b9151 100644
--- a/src/fastertransformer/th_op/CMakeLists.txt
+++ b/src/fastertransformer/th_op/CMakeLists.txt
@@ -27,6 +27,7 @@ add_subdirectory(longformer)
 add_subdirectory(swin)
 add_subdirectory(vit)
 add_subdirectory(multi_gpu_gpt)
+add_subdirectory(gptneox)
 add_subdirectory(t5)
 add_subdirectory(bart)
 add_subdirectory(bert)
@@ -43,6 +44,7 @@ add_library(th_transformer SHARED
             $<TARGET_OBJECTS:th_gather_tree>
             $<TARGET_OBJECTS:th_longformer>
             $<TARGET_OBJECTS:th_parallel_gpt>
+            $<TARGET_OBJECTS:th_gptneox>
             $<TARGET_OBJECTS:th_swintransformer>
             $<TARGET_OBJECTS:th_t5>
             $<TARGET_OBJECTS:th_utils>
@@ -59,6 +61,7 @@ target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}"
                       th_gather_tree
                       th_longformer
                       th_parallel_gpt
+                      th_gptneox
                       th_swintransformer
                       th_t5
                       th_utils
diff --git a/src/fastertransformer/th_op/common/GptOps.cc b/src/fastertransformer/th_op/common/GptOps.cc
index ea3a86887..fbb018085 100644
--- a/src/fastertransformer/th_op/common/GptOps.cc
+++ b/src/fastertransformer/th_op/common/GptOps.cc
@@ -48,6 +48,7 @@ std::vector<Tensor> find_context_duplications(Tensor input_ids)
                               get_ptr<int>(compact_size_tensor),
                               get_ptr<const int>(input_ids),
                               batch_size,
+                              1,
                               seq_len,
                               stream);
 
diff --git a/src/fastertransformer/th_op/gptneox/CMakeLists.txt b/src/fastertransformer/th_op/gptneox/CMakeLists.txt
new file mode 100755
index 000000000..dcebaa80c
--- /dev/null
+++ b/src/fastertransformer/th_op/gptneox/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_library(th_gptneox STATIC GptNeoXOp.cc)  # PyTorch binding for the GPT-NeoX model
+set_target_properties(th_gptneox PROPERTIES POSITION_INDEPENDENT_CODE ON)  # its objects end up in a SHARED library
+target_link_libraries(th_gptneox PRIVATE "${TORCH_LIBRARIES}" GptNeoX th_utils nccl_utils)
diff --git a/src/fastertransformer/th_op/gptneox/GptNeoXOp.cc b/src/fastertransformer/th_op/gptneox/GptNeoXOp.cc
new file mode 100755
index 000000000..09e09c8e0
--- /dev/null
+++ b/src/fastertransformer/th_op/gptneox/GptNeoXOp.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/th_op/gptneox/GptNeoXOp.h"
+
+namespace th = torch;
+namespace ft = fastertransformer;
+namespace torch_ext {
+
+// Constructs the TorchScript-visible op and the dtype-specific backend behind it.
+//
+// `weights` is the flat list FTGptNeoX<T> indexes: 12 groups of `layer_num`
+// per-layer tensors followed by 4 model-level tensors. The scalar type of the
+// first tensor selects the float or half backend; any other type throws.
+GptNeoXOp::GptNeoXOp(const int64_t            head_num,
+                     const int64_t            size_per_head,
+                     const int64_t            inter_size,
+                     const int64_t            layer_num,
+                     const int64_t            vocab_size,
+                     const int64_t            rotary_embedding_dim,
+                     const int64_t            start_id,
+                     const int64_t            end_id,
+                     const int64_t            tensor_para_size,
+                     const int64_t            pipeline_para_size,
+                     const int64_t            max_seq_len,
+                     const bool               use_gptj_residual,
+                     const vector<th::Tensor> weights):
+    // at() makes an empty weight list throw instead of reading weights[0] out of bounds.
+    st_(weights.at(0).scalar_type())
+{
+    // The backend reads indices up to 12 * layer_num + 3; reject short lists before
+    // any pointer is taken from them.
+    TORCH_CHECK(layer_num >= 0 && weights.size() >= static_cast<size_t>(12 * layer_num + 4),
+                "weights should contain at least 12 * layer_num + 4 tensors");
+    // Bind by reference: validating a tensor does not need a refcounted copy of it.
+    for (const auto& t : weights) {
+        CHECK_INPUT(t, st_);
+    }
+
+    // Instantiate the backend whose element type matches the weights.
+    switch (st_) {
+        case at::ScalarType::Float:
+            ftgpt = new FTGptNeoX<float>((size_t)head_num, (size_t)size_per_head,
+                                         (size_t)inter_size, (size_t)layer_num,
+                                         (size_t)vocab_size, (size_t)rotary_embedding_dim,
+                                         start_id, end_id,
+                                         tensor_para_size, pipeline_para_size,
+                                         (size_t)max_seq_len, use_gptj_residual,
+                                         weights);
+            break;
+        case at::ScalarType::Half:
+            ftgpt = new FTGptNeoX<half>((size_t)head_num, (size_t)size_per_head,
+                                        (size_t)inter_size, (size_t)layer_num,
+                                        (size_t)vocab_size, (size_t)rotary_embedding_dim,
+                                        start_id, end_id,
+                                        tensor_para_size, pipeline_para_size,
+                                        (size_t)max_seq_len, use_gptj_residual,
+                                        weights);
+            break;
+        default:
+            throw std::runtime_error("Wrong Tensor type.");
+    }
+}
+
+// Destroys the dtype-specific backend allocated by the constructor. IFGptNeoX
+// declares a virtual destructor, so deleting through the interface pointer
+// runs the concrete FTGptNeoX<T> destructor.
+GptNeoXOp::~GptNeoXOp() { delete ftgpt; }
+
+// Runs generation for a batch of prompts.
+//
+// input_ids must be a contiguous int32 CUDA tensor [batch_size, max_input_length] and
+// input_lengths a contiguous int32 CUDA tensor with one entry per batch row. Returns
+// {output_ids, sequence_lengths}, plus cum_log_probs when return_cum_log_probs is 1.
+std::vector<th::Tensor> GptNeoXOp::forward(th::Tensor               input_ids,
+                                           th::Tensor               input_lengths,
+                                           const int64_t            output_len,
+                                           th::optional<int64_t>    beam_width_opt,
+                                           th::optional<th::Tensor> top_k_opt,
+                                           th::optional<th::Tensor> top_p_opt,
+                                           th::optional<th::Tensor> beam_search_diversity_rate_opt,
+                                           th::optional<th::Tensor> temperature_opt,
+                                           th::optional<th::Tensor> len_penalty_opt,
+                                           th::optional<th::Tensor> repetition_penalty_opt,
+                                           th::optional<th::Tensor> random_seed_opt,
+                                           th::optional<int64_t>    return_cum_log_probs_opt)
+{
+    CHECK_TH_CUDA(input_ids);
+    CHECK_CONTIGUOUS(input_ids);
+    TORCH_CHECK(input_ids.dtype() == torch::kInt32, "input_ids dtype should be int32");
+    CHECK_TH_CUDA(input_lengths);
+    CHECK_CONTIGUOUS(input_lengths);
+    TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32");
+    TORCH_CHECK(input_ids.dim() == 2, "input_ids should be a 2-D tensor [batch_size, max_input_length]");
+    TORCH_CHECK(input_lengths.numel() == input_ids.size(0), "input_lengths should have one entry per batch row");
+    // A negative length would wrap to a huge value in the size_t cast further down.
+    TORCH_CHECK(output_len >= 0, "output_len should be non-negative");
+
+    const int64_t return_cum_log_probs = return_cum_log_probs_opt.has_value() ? return_cum_log_probs_opt.value() : 0;
+    TORCH_CHECK(return_cum_log_probs == 0 || return_cum_log_probs == 1,
+                "return_cum_log_probs should be"
+                " 0 (no return cum_log_probs), "
+                " 1 (the cumulative log probs of generated sequences)");
+
+    const int beam_width = beam_width_opt.has_value() ? (int)beam_width_opt.value() : 1;
+    TORCH_CHECK(beam_width >= 1, "beam_width should be at least 1");
+
+    const int  batch_size               = input_ids.size(0);
+    const int  max_input_length         = input_ids.size(1);
+    const int  total_request_output_len = max_input_length + output_len;
+    th::Tensor output_ids               = torch::empty({batch_size, beam_width, total_request_output_len},
+                                         torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false));
+    th::Tensor sequence_lengths =
+        torch::empty({batch_size, beam_width}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false));
+    th::Tensor cum_log_probs =
+        torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false));
+
+    ftgpt->forward(input_ids, input_lengths,
+                   output_ids, sequence_lengths, cum_log_probs,
+                   (const size_t)output_len, (const size_t)beam_width,
+                   top_k_opt, top_p_opt, beam_search_diversity_rate_opt,
+                   temperature_opt, len_penalty_opt, repetition_penalty_opt,
+                   random_seed_opt, return_cum_log_probs_opt);
+    if (return_cum_log_probs > 0) {
+        return std::vector<th::Tensor>{output_ids, sequence_lengths, cum_log_probs};
+    }
+    return std::vector<th::Tensor>{output_ids, sequence_lengths};
+}
+
+}  // namespace torch_ext
+
+static auto fasterTransformerGptTHS =  // file-local; evaluating the initializer registers GptNeoXOp as a torch custom class
+#ifdef LEGACY_THS
+    torch::jit::class_<torch_ext::GptNeoXOp>("FasterTransformerGptNeoXOp")  // single-name form
+#else
+    torch::jit::class_<torch_ext::GptNeoXOp>("FasterTransformer", "GptNeoXOp")  // namespace + class name form
+#endif
+        .def(torch::jit::init<int64_t,  // head_num
+                              int64_t,  // size_per_head
+                              int64_t,  // inter_size
+                              int64_t,  // layer_num
+                              int64_t,  // vocab_size
+                              int64_t,  // rotary_embedding_dim
+                              int64_t,  // start_id
+                              int64_t,  // end_id
+                              int64_t,  // tensor_para_size
+                              int64_t,  // pipeline_para_size
+                              int64_t,  // max_seq_len
+                              bool,     // use_gptj_residual
+                              std::vector<th::Tensor>>())  // weights
+        .def("forward", &torch_ext::GptNeoXOp::forward);
diff --git a/src/fastertransformer/th_op/gptneox/GptNeoXOp.h b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h
new file mode 100755
index 000000000..222fdd409
--- /dev/null
+++ b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2021, NAVER Corp.  Authored by CLOVA.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/models/gptneox/GptNeoX.h"
+#include "src/fastertransformer/th_op/th_utils.h"
+#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/fastertransformer/utils/nccl_utils.h"
+
+namespace ft = fastertransformer;
+namespace th = torch;
+namespace torch_ext {
+
+using std::vector;
+
+// Type-erased interface over FTGptNeoX<T>, so that a caller can hold the float
+// or the half backend behind a single pointer.
+class IFGptNeoX {
+public:
+    // Virtual so that deleting through an IFGptNeoX* destroys the concrete backend.
+    virtual ~IFGptNeoX() = default;
+
+    // Generates tokens for the prompts in input_ids. Results are written into the
+    // caller-provided output_ids, sequence_lengths and cum_log_probs tensors; the
+    // *_opt arguments carry optional per-call generation settings.
+    virtual void forward(th::Tensor& input_ids, th::Tensor& input_lengths,
+                         th::Tensor& output_ids, th::Tensor& sequence_lengths, th::Tensor& cum_log_probs,
+                         const size_t request_output_len, const size_t beam_width,
+                         th::optional<th::Tensor> top_k_opt, th::optional<th::Tensor> top_p_opt,
+                         th::optional<th::Tensor> beam_search_diversity_rate_opt,
+                         th::optional<th::Tensor> temperature_opt, th::optional<th::Tensor> len_penalty_opt,
+                         th::optional<th::Tensor> repetition_penalty_opt, th::optional<th::Tensor> random_seed_opt,
+                         th::optional<int64_t> return_cum_log_probs_opt) = 0;
+};
+
+// Per-dtype bridge from torch tensors to ft::GptNeoX<T>: weights are bound once, the model is built per forward().
+template<typename T>
+class FTGptNeoX: public IFGptNeoX {
+public:
+    FTGptNeoX(const size_t             head_num,
+              const size_t             size_per_head,
+              const size_t             inter_size,
+              const size_t             layer_num,
+              const size_t             vocab_size,
+              const size_t             rotary_embedding_dim,
+              const int                start_id,
+              const int                end_id,
+              const int64_t            tensor_para_size,
+              const int64_t            pipeline_para_size,
+              const size_t             max_seq_len,
+              const bool               use_gptj_residual,
+              const vector<th::Tensor> weights):
+        head_num_(head_num),
+        size_per_head_(size_per_head),
+        inter_size_(inter_size),
+        layer_num_(layer_num),
+        vocab_size_(vocab_size),
+        rotary_embedding_dim_(rotary_embedding_dim),
+        start_id_(start_id),
+        end_id_(end_id),
+        use_gptj_residual_(use_gptj_residual),
+        weights_(weights),  // held as a member; gpt_weights_ below stores pointers obtained from these tensors
+        tensor_para_size_(tensor_para_size),
+        pipeline_para_size_(pipeline_para_size)
+    {
+        // weights_ is indexed up to 12 * layer_num_ + 3 below; fail before any handle is created.
+        TORCH_CHECK(weights_.size() >= 12 * layer_num_ + 4, "weights should contain at least 12 * layer_num + 4 tensors");
+        ft::check_cuda_error(cublasLtCreate(&cublasltHandle_));
+        cublas_algo_map_      = new ft::cublasAlgoMap(GEMM_CONFIG, "");
+        cublas_wrapper_mutex_ = new std::mutex();
+        ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size);
+
+        gpt_weights_.resizeLayer(layer_num_);
+        for (int i = 0; i < (int)layer_num_; i++) {  // field k of layer i lives at weights_[i + k * layer_num_]
+            gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta =
+                get_ptr<T>(weights_[i + 0 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma =
+                get_ptr<T>(weights_[i + 1 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel =
+                get_ptr<T>(weights_[i + 2 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias =
+                get_ptr<T>(weights_[i + 3 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel =
+                get_ptr<T>(weights_[i + 4 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias =
+                get_ptr<T>(weights_[i + 5 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel =
+                get_ptr<T>(weights_[i + 6 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias =
+                get_ptr<T>(weights_[i + 7 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel =
+                get_ptr<T>(weights_[i + 8 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias =
+                get_ptr<T>(weights_[i + 9 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta =
+                get_ptr<T>(weights_[i + 10 * layer_num_]);
+            gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma =
+                get_ptr<T>(weights_[i + 11 * layer_num_]);
+        }
+
+        gpt_weights_.pre_decoder_embedding_table   = get_ptr<T>(weights_[12 * layer_num_ + 0]);
+        gpt_weights_.post_decoder_layernorm.gamma  = get_ptr<T>(weights_[12 * layer_num_ + 1]);
+        gpt_weights_.post_decoder_layernorm.beta   = get_ptr<T>(weights_[12 * layer_num_ + 2]);
+        gpt_weights_.post_decoder_embedding.kernel = get_ptr<T>(weights_[12 * layer_num_ + 3]);
+
+        gpt_weights_.setMaxSeqLen(max_seq_len);
+        int device_id = 0;  // query the device this process is bound to instead of assuming GPU 0
+        ft::check_cuda_error(cudaGetDevice(&device_id));
+        ft::check_cuda_error(cudaGetDeviceProperties(&prop_, device_id));
+    }
+
+    ~FTGptNeoX() override
+    {
+        ft::ftNcclParamDestroy(tensor_para_);
+        ft::ftNcclParamDestroy(pipeline_para_);
+        cublasLtDestroy(cublasltHandle_);  // created in the constructor
+        delete cublas_algo_map_;
+        delete cublas_wrapper_mutex_;
+    }
+
+    void forward(th::Tensor&              input_ids,
+                 th::Tensor&              input_lengths,
+                 th::Tensor&              output_ids,
+                 th::Tensor&              sequence_lengths,
+                 th::Tensor&              cum_log_probs,
+                 const size_t             request_output_len,
+                 const size_t             beam_width,
+                 th::optional<th::Tensor> top_k_opt,
+                 th::optional<th::Tensor> top_p_opt,
+                 th::optional<th::Tensor> beam_search_diversity_rate_opt,
+                 th::optional<th::Tensor> temperature_opt,
+                 th::optional<th::Tensor> len_penalty_opt,
+                 th::optional<th::Tensor> repetition_penalty_opt,
+                 th::optional<th::Tensor> random_seed_opt,
+                 th::optional<int64_t>    return_cum_log_probs_opt) override
+    {
+        int return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int)return_cum_log_probs_opt.value() : 0;
+
+        auto           stream       = at::cuda::getCurrentCUDAStream().stream();  // torch's current CUDA stream
+        cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle();
+        cublasSetStream(cublasHandle, stream);
+        ft::Allocator<ft::AllocatorType::TH> allocator      = ft::Allocator<ft::AllocatorType::TH>();
+        ft::cublasMMWrapper                  cublas_wrapper = ft::cublasMMWrapper(
+            cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator);
+
+        if (std::is_same<T, half>::value) {
+            cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F);
+        }
+        else if (std::is_same<T, float>::value) {
+            cublas_wrapper.setFP32GemmConfig();
+        }
+
+        const size_t request_batch_size = (size_t)input_ids.size(0);
+        const size_t max_input_length   = (size_t)input_ids.size(1);
+        const int    total_output_len   = (int)(max_input_length + request_output_len);  // prompt + generated tokens
+
+        ft::AttentionType attention_type = ft::getAttentionType<T>(size_per_head_,
+                                                                   ft::getSMVersion(),
+                                                                   true,   // remove_padding
+                                                                   0,      // gpt supports any-seq-length fmha
+                                                                   true,   // is_fuse
+                                                                   false,  // with_relative_position_bias
+                                                                   true);  // causal_mask
+
+        ft::GptNeoX<T> gpt = ft::GptNeoX<T>(head_num_,  // a fresh model object is constructed on every call
+                                            size_per_head_,
+                                            inter_size_,
+                                            layer_num_,
+                                            vocab_size_,
+                                            rotary_embedding_dim_,
+                                            start_id_,
+                                            end_id_,
+                                            end_id_ + 1,  // p/prompt tuning virtual token start id
+                                            ft::PromptLearningType::no_prompt,
+                                            use_gptj_residual_,
+                                            0.0f,  // beam_search_diversity_rate,
+                                            1,     // top_k,
+                                            0.0,   // top_p,
+                                            0,     // random_seed,
+                                            1.0f,  // temperature,
+                                            1.0f,  // len_penalty,
+                                            1.0f,  // repetition_penalty,
+                                            tensor_para_,
+                                            pipeline_para_,
+                                            stream,
+                                            &cublas_wrapper,
+                                            &allocator,
+                                            false,           // is_free_buffer_after_forward
+                                            &prop_,          // cuda_device_prop
+                                            attention_type,  // attention_type
+                                            nullptr,         // custom_all_reduce_comm
+                                            0);              // enable_custom_all_reduce
+
+        std::vector<uint32_t> output_seq_len(request_batch_size, total_output_len);  // same limit for every request
+
+        std::unordered_map<std::string, ft::Tensor> input_tensors = std::unordered_map<std::string, ft::Tensor>{
+            {"input_ids",
+             ft::Tensor{ft::MEMORY_GPU,
+                        ft::TYPE_INT32,
+                        std::vector<size_t>{request_batch_size, max_input_length},
+                        get_ptr<int>(input_ids)}},
+            {"input_lengths",
+             ft::Tensor{
+                 ft::MEMORY_GPU, ft::TYPE_INT32, std::vector<size_t>{request_batch_size}, get_ptr<int>(input_lengths)}},
+            {"output_seq_len",
+             ft::Tensor{
+                 ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector<size_t>{request_batch_size}, output_seq_len.data()}}};
+        if (beam_width > 1 && beam_search_diversity_rate_opt.has_value()) {  // only forwarded when beam search is on
+            input_tensors.insert(
+                {"beam_search_diversity_rate",
+                 convert_tensor<float>(beam_search_diversity_rate_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+        if (top_p_opt.has_value()) {
+            input_tensors.insert(
+                {"runtime_top_p", convert_tensor<float>(top_p_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+        if (top_k_opt.has_value()) {
+            input_tensors.insert(
+                {"runtime_top_k", convert_tensor<uint>(top_k_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+        if (temperature_opt.has_value()) {
+            input_tensors.insert(
+                {"temperature", convert_tensor<float>(temperature_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+        if (len_penalty_opt.has_value()) {
+            input_tensors.insert(
+                {"len_penalty", convert_tensor<float>(len_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+        if (repetition_penalty_opt.has_value()) {
+            input_tensors.insert({"repetition_penalty",
+                                  convert_tensor<float>(repetition_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+        if (random_seed_opt.has_value()) {
+            input_tensors.insert(
+                {"random_seed",
+                 convert_tensor<unsigned long long int>(random_seed_opt.value(), ft::MemoryType::MEMORY_CPU)});
+        }
+
+        std::unordered_map<std::string, ft::Tensor> output_tensors = std::unordered_map<std::string, ft::Tensor>{
+            {"output_ids",
+             ft::Tensor{ft::MEMORY_GPU,
+                        ft::TYPE_INT32,
+                        std::vector<size_t>{request_batch_size, beam_width, (size_t)total_output_len},
+                        get_ptr<int>(output_ids)}},
+            {"sequence_length",
+             ft::Tensor{ft::MEMORY_GPU,
+                        ft::TYPE_INT32,
+                        std::vector<size_t>{request_batch_size, beam_width},
+                        get_ptr<int>(sequence_lengths)}}};
+
+        if (return_cum_log_probs > 0) {  // cum_log_probs is only exposed to the model when the caller asked for it
+            output_tensors.insert({"cum_log_probs",
+                                   ft::Tensor{ft::MEMORY_GPU,
+                                              ft::TYPE_FP32,
+                                              std::vector<size_t>{request_batch_size, beam_width},
+                                              get_ptr<float>(cum_log_probs)}});
+        }
+
+        try {
+            gpt.forward(&output_tensors, &input_tensors, &gpt_weights_);
+        }
+        catch (const std::runtime_error& error) {
+            std::cout << error.what();
+            exit(-1);  // NOTE(review): terminates the whole host process rather than propagating - confirm intended
+        }
+        catch (...) {
+            std::cout << "Runtime error";
+            exit(-1);
+        }
+    }
+
+private:
+    const size_t head_num_;
+    const size_t size_per_head_;
+    const size_t inter_size_;
+    const size_t layer_num_;
+    const size_t vocab_size_;
+    const size_t rotary_embedding_dim_;
+    const int    start_id_;
+    const int    end_id_;
+    const bool   use_gptj_residual_;
+
+    std::vector<th::Tensor> weights_;
+    cublasLtHandle_t        cublasltHandle_;
+    std::mutex*             cublas_wrapper_mutex_;  // owned; deleted in the destructor
+    ft::cublasAlgoMap*      cublas_algo_map_;       // owned; deleted in the destructor
+    struct cudaDeviceProp   prop_;                  // properties of the CUDA device current at construction
+    ft::GptNeoXWeight<T>    gpt_weights_;
+
+    ft::NcclParam tensor_para_;
+    ft::NcclParam pipeline_para_;
+    int64_t tensor_para_size_;  // stored from the constructor arguments; not read elsewhere in this class
+    int64_t pipeline_para_size_;
+};
+
+// TorchScript custom class that exposes GPT-NeoX generation. It holds one
+// type-erased backend (IFGptNeoX) selected from the scalar type of the
+// weights passed to the constructor.
+class GptNeoXOp: public th::jit::CustomClassHolder {
+public:
+    // Hyper-parameters are int64_t/bool to match the constructor signature
+    // registered with torch::jit::init; `weights` is the flat list of model
+    // tensors that is handed on to the backend.
+    GptNeoXOp(const int64_t head_num, const int64_t size_per_head, const int64_t inter_size,
+              const int64_t layer_num, const int64_t vocab_size, const int64_t rotary_embedding_dim,
+              const int64_t start_id, const int64_t end_id,
+              const int64_t tensor_para_size, const int64_t pipeline_para_size,
+              const int64_t max_seq_len, const bool use_gptj_residual,
+              const vector<th::Tensor> weights);
+
+    // Deletes the backend allocated by the constructor.
+    ~GptNeoXOp();
+
+    // Runs generation for the prompts in input_ids, requesting output_len
+    // generated tokens per prompt. Returns {output_ids, sequence_lengths},
+    // followed by cum_log_probs when return_cum_log_probs_opt holds 1.
+    // beam_width_opt defaults to 1 when empty.
+    vector<th::Tensor> forward(th::Tensor input_ids, th::Tensor input_lengths, const int64_t output_len,
+                               th::optional<int64_t> beam_width_opt,
+                               th::optional<th::Tensor> top_k_opt, th::optional<th::Tensor> top_p_opt,
+                               th::optional<th::Tensor> beam_search_diversity_rate_opt,
+                               th::optional<th::Tensor> temperature_opt, th::optional<th::Tensor> len_penalty_opt,
+                               th::optional<th::Tensor> repetition_penalty_opt,
+                               th::optional<th::Tensor> random_seed_opt,
+                               th::optional<int64_t> return_cum_log_probs_opt);
+
+private:
+    const at::ScalarType    st_;      // scalar type of the first weight; selects the backend
+    IFGptNeoX*              ftgpt;    // owning pointer, released in the destructor
+    std::vector<th::Tensor> weights;  // not populated by the constructor; the backend keeps its own copy
+};
+
+}  // namespace torch_ext
diff --git a/src/fastertransformer/utils/cublasMMWrapper.cc b/src/fastertransformer/utils/cublasMMWrapper.cc
index 12e6c8f0a..baf460fdc 100644
--- a/src/fastertransformer/utils/cublasMMWrapper.cc
+++ b/src/fastertransformer/utils/cublasMMWrapper.cc
@@ -799,7 +799,7 @@ std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHand
                                                                     cublasLtMatrixLayout_t Ddesc,
                                                                     cudaStream_t           stream)
 {
-#if (CUBLAS_VERSION) <= 11402
+#if (CUBLAS_VERSION) < 11601
     FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
     return {false, cublasLtMatmulAlgo_t{}};
 #else
@@ -984,7 +984,7 @@ void cublasMMWrapper::_Int8Gemm(const int     m,
      *  - 0: int8 * int8 -> int32 -> int8
      *  - 1: int8 * int8 -> int32 -> int32
      */
-#if (CUBLAS_VERSION) <= 11402
+#if (CUBLAS_VERSION) < 11601
     FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
 #else
 
diff --git a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc
index 165206710..474a8c81a 100644
--- a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc
+++ b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc
@@ -751,14 +751,14 @@ template void generate_gpt_gemm_config<__nv_fp8_e4m3>(int   batch_size,
                                                       bool  isAppend);
 #endif
 
-size_t calGptGemmTestBufSizeInByte(int            batch_size,
-                                   int            beam_width,
-                                   int            max_input_len,
-                                   int            head_num,
-                                   int            size_per_head,
-                                   int            inter_size,
-                                   int            vocab_size,
-                                   int            tensor_para_size,
+size_t calGptGemmTestBufSizeInByte(size_t         batch_size,
+                                   size_t         beam_width,
+                                   size_t         max_input_len,
+                                   size_t         head_num,
+                                   size_t         size_per_head,
+                                   size_t         inter_size,
+                                   size_t         vocab_size,
+                                   size_t         tensor_para_size,
                                    CublasDataType data_type)
 {
     size_t       buf_size_in_byte   = 0;
diff --git a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h
index 82eec3b1b..336ef95f6 100644
--- a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h
+++ b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h
@@ -50,14 +50,14 @@ void generate_gpt_gemm_config(int   batch_size,
                               void* buffer_in,
                               bool  isAppend);
 
-size_t calGptGemmTestBufSizeInByte(int            batch_size,
-                                   int            beam_width,
-                                   int            max_input_len,
-                                   int            head_num,
-                                   int            size_per_head,
-                                   int            inter_size,
-                                   int            vocab_size,
-                                   int            tensor_para_size,
+size_t calGptGemmTestBufSizeInByte(size_t         batch_size,
+                                   size_t         beam_width,
+                                   size_t         max_input_len,
+                                   size_t         head_num,
+                                   size_t         size_per_head,
+                                   size_t         inter_size,
+                                   size_t         vocab_size,
+                                   size_t         tensor_para_size,
                                    CublasDataType data_type);
 
 }  // namespace fastertransformer
diff --git a/tests/decoding/tf_fused_self_multihead_attention_unit_test.py b/tests/decoding/tf_fused_self_multihead_attention_unit_test.py
index a4a7031e5..a09c0028d 100644
--- a/tests/decoding/tf_fused_self_multihead_attention_unit_test.py
+++ b/tests/decoding/tf_fused_self_multihead_attention_unit_test.py
@@ -56,12 +56,12 @@ def test_attn_head_fp16(self):
             self.run_attn(4, 128, head, 64, tf.float16)
 
     def test_attn_size_fp32(self):
-        for size in [32, 64, 80, 96, 128, 144, 160, 192, 224, 256]:
+        for size in [32, 64, 80, 96, 112, 128, 144, 160, 192, 224, 256]:
             tf.reset_default_graph()
             self.run_attn(4, 128, 12, size, tf.float32)
 
     def test_attn_size_fp16(self):
-        for size in [32, 64, 80, 96, 128, 144, 160, 192, 224, 256]:
+        for size in [32, 64, 80, 96, 112, 128, 144, 160, 192, 224, 256]:
             tf.reset_default_graph()
             self.run_attn(4, 128, 12, size, tf.float16)
 
@@ -171,4 +171,4 @@ def run_attn(self, batch_size, seq_len, head_num, size_per_head, data_type):
             assert(v_cache_max_diff < threshold)
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/unittests/test_gpt_kernels.cu b/tests/unittests/test_gpt_kernels.cu
index cef959078..c41308b8c 100644
--- a/tests/unittests/test_gpt_kernels.cu
+++ b/tests/unittests/test_gpt_kernels.cu
@@ -85,6 +85,7 @@ int test_find_context_dups()
             d_compact_size,
             d_input_ids,
             batch_size,
+            1,//beam_width
             vec_size);
 
     int compact_size;

From da9ef99a3ea0c2e3a1f1fbc083aeb76452d6b32f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:01:20 -0700
Subject: [PATCH 02/79] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index ef789d35d..f9af743e7 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -6,7 +6,7 @@ tensor_para_size=1
 pipeline_para_size=1
 
 model_name=llama_7b
-model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
+model_dir=/notebooks/llama2-7b-chat-tp8/
 
 [request]
 beam_width=1 # beam width for beam search

From 7f1e8bfeefd50ecb3889715eb3b67513871454ec Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:01:59 -0700
Subject: [PATCH 03/79] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index f9af743e7..616a7581b 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -2,7 +2,7 @@
 data_type=fp16
 enable_custom_all_reduce=0
 
-tensor_para_size=1
+tensor_para_size=8
 pipeline_para_size=1
 
 model_name=llama_7b

From bb6fce458bb540ab6a7de634a1f81295d5b27f2b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:03:39 -0700
Subject: [PATCH 04/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 3df2a2203..ea6945660 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -316,7 +316,7 @@ int main(int argc, char* argv[])
     std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
 
     // step 1: Create model
-    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createLlamaModel(ini_name);
+    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createLlamaModel("/notebooks/llama2-7b-chat-tp8/config.ini");
     int                                       tensor_para_size = model->getTensorParaSize();
     int                                       pipeline_para_size = model->getPipelineParaSize();
     FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),

From 49c94e8659fc1bf693325088b353d325fe734cc6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:05:02 -0700
Subject: [PATCH 05/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index ea6945660..3df2a2203 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -316,7 +316,7 @@ int main(int argc, char* argv[])
     std::string ini_name   = argc >= 2 ? std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
 
     // step 1: Create model
-    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createLlamaModel("/notebooks/llama2-7b-chat-tp8/config.ini");
+    std::shared_ptr<AbstractTransformerModel> model            = AbstractTransformerModel::createLlamaModel(ini_name);
     int                                       tensor_para_size = model->getTensorParaSize();
     int                                       pipeline_para_size = model->getPipelineParaSize();
     FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),

From c510c26adebb7fe24505cbfd6b61b37f57deb161 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:06:44 -0700
Subject: [PATCH 06/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 3df2a2203..2b9d9d663 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -242,10 +242,10 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
                        max_input_len,
                        end_id,
                        1,
-                       "../examples/cpp/llama/start_ids.csv");
+                       "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
 
     std::vector<int> v_bad_words;
-    ft::read_word_list("../examples/cpp/llama/bad_words.csv", v_bad_words);
+    ft::read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/bad_words.csv", v_bad_words);
 
     RequestParam param;
     param.beam_width                 = reader.GetInteger("request", "beam_width");

From 8933482e0b82fa33e20b7b322ca49008408be526 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:08:12 -0700
Subject: [PATCH 07/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 2b9d9d663..a51fcbf32 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -378,6 +378,7 @@ int main(int argc, char* argv[])
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
     const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data;
+    printf("Here\n");
     // step 6: check results
     if (node_id == 0) {
 

From 98ab7df047d87b320795056ca55cd7b6796b61b8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:08:55 -0700
Subject: [PATCH 08/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index a51fcbf32..68fd10f58 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -377,8 +377,9 @@ int main(int argc, char* argv[])
     const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
+        printf("Here\n");
     const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data;
-    printf("Here\n");
+
     // step 6: check results
     if (node_id == 0) {
 

From 626287a3f541b2dce493fba8c3cc20f981b12b24 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:10:00 -0700
Subject: [PATCH 09/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 68fd10f58..9f08500f7 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -358,7 +358,7 @@ int main(int argc, char* argv[])
     // step 5: Forward
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(
         (size_t)gpu_count);
-    for (int i = 0; i < 2; i++) {
+    for (int i = 0; i < 1; i++) {
         threads.clear();
         for (int device_id = 0; device_id < gpu_count; device_id++) {
             threads.push_back(std::thread(threadForward,

From 787c1c5c2de5f23b44989a44fb626be3e9c88ea2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:10:54 -0700
Subject: [PATCH 10/79] commit

---
 .vscode/settings.json                      | 3 ++-
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 82000232b..79166a171 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -68,6 +68,7 @@
         "future": "cpp",
         "cfenv": "cpp",
         "typeindex": "cpp",
-        "variant": "cpp"
+        "variant": "cpp",
+        "ios": "cpp"
     }
 }
diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 9f08500f7..4b8f90563 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
         printf("Here\n");
     const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data;
-
+printf("Here\n");
     // step 6: check results
     if (node_id == 0) {
 

From 06e941b9083b7c6b665b0e370566b17a41df6393 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:12:19 -0700
Subject: [PATCH 11/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 4b8f90563..5ecb737b7 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -377,9 +377,6 @@ int main(int argc, char* argv[])
     const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
-        printf("Here\n");
-    const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data;
-printf("Here\n");
     // step 6: check results
     if (node_id == 0) {
 

From ce8272aab00aab7abc619389e2a8dcaa41dd5672 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:12:47 -0700
Subject: [PATCH 12/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 5ecb737b7..0b57c3ac4 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -390,7 +390,6 @@ int main(int argc, char* argv[])
             int*   hBuf     = new int[outCount];
             int*   iBuf     = new int[batch_size];
             ft::cudaD2Hcpy(hBuf, d_output_ids, outCount);
-            ft::cudaD2Hcpy(iBuf, d_input_lengths, batch_size);
             
 
             {

From c6f25436999542406cbfceaeabf1e0ce93fe8be6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:25:20 -0700
Subject: [PATCH 13/79] commit

---
 examples/cpp/llama/start_ids.csv | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 612c85964..891837f48 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1,10 @@
-1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From e1f2a76107b760f921a82f4bf75c7146513a3092 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:26:10 -0700
Subject: [PATCH 14/79] commit

---
 examples/cpp/llama/llama_config.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index 616a7581b..ee3ebbaff 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -17,8 +17,8 @@ repetition_penalty=1.0 ; Use for sampling
 presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty are allowed.
 len_penalty=0.0
 beam_search_diversity_rate=0.0
-request_batch_size=8 # determine by the request
-request_output_len=32 # determine by the request
+request_batch_size=20 # determine by the request
+request_output_len=512 # determine by the request
 
 [llama_7b]
 head_num = 64

From 441c343a6467f51ada3c38d8a607bca2f4c92fcc Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:27:37 -0700
Subject: [PATCH 15/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 0b57c3ac4..68570914b 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -395,9 +395,9 @@ int main(int argc, char* argv[])
             {
                 std::cout << "Writing " << outCount << " elements\n";
                 int zeroCount = 0;
-                for (int i=0; i<batch_size; i++) {
-                    printf("%d ", iBuf[i]);
-                }
+                // for (int i=0; i<batch_size; i++) {
+                //     printf("%d ", iBuf[i]);
+                // }
                 printf("\n");
                 for (size_t i = 0; i < outCount; i++) {
                     if (hBuf[i] == int(0))
@@ -407,7 +407,7 @@ int main(int argc, char* argv[])
                         outFile << std::endl;
 
                     // if (i < 10)
-                        printf("%5d ", hBuf[i]);
+                        printf("%d ", hBuf[i]);
                     // if ((i + 1) % (seq_len) == 0 && i < 10)
                     //     std::cout << std::endl;
                 }

From 3cf5490718bce663a41155f6353646662039b7a6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 20:28:39 -0700
Subject: [PATCH 16/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 68570914b..4366c8534 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -404,7 +404,7 @@ int main(int argc, char* argv[])
                         zeroCount++;
                     outFile << hBuf[i] << " ";
                     if ((i + 1) % (seq_len) == 0)
-                        outFile << std::endl;
+                        printf("\n\n");
 
                     // if (i < 10)
                         printf("%d ", hBuf[i]);

From 38919b65a99d1e30d84f19f5de42a74d924bbd4a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:01:26 -0700
Subject: [PATCH 17/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 4366c8534..65e00187a 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -407,7 +407,7 @@ int main(int argc, char* argv[])
                         printf("\n\n");
 
                     // if (i < 10)
-                        printf("%d ", hBuf[i]);
+                        printf("%d,", hBuf[i]);
                     // if ((i + 1) % (seq_len) == 0 && i < 10)
                     //     std::cout << std::endl;
                 }

From 4c0dbba5fdbda5ef0138b03ceba608d86506ca92 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:05:14 -0700
Subject: [PATCH 18/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 65e00187a..e71a77695 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -377,6 +377,7 @@ int main(int argc, char* argv[])
     const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
+    printf("%d %d %d\n", batch_size, beam_width, seq_len);
     // step 6: check results
     if (node_id == 0) {
 

From b6945af49b16afaab165cb3d60ed7d3286692369 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:08:15 -0700
Subject: [PATCH 19/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index e71a77695..559f7afdd 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -353,7 +353,7 @@ int main(int argc, char* argv[])
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
         prepareRequest(ini_name, node_id, gpu_count, &pointer_record);
-    printf("[INFO] request is created \n");
+    printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(

From 728f8900488a2547f85d3c5c764fe892a91fe128 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:09:22 -0700
Subject: [PATCH 20/79] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index ee3ebbaff..4257c9c5a 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -17,7 +17,7 @@ repetition_penalty=1.0 ; Use for sampling
 presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty are allowed.
 len_penalty=0.0
 beam_search_diversity_rate=0.0
-request_batch_size=20 # determine by the request
+request_batch_size=10 # determine by the request
 request_output_len=512 # determine by the request
 
 [llama_7b]

From 8aeb13ae76c12cdb2cf49372bb1da929fff0f6e1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:17:44 -0700
Subject: [PATCH 21/79] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 examples/cpp/llama/start_ids.csv    | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index 4257c9c5a..19f6d016d 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -17,7 +17,7 @@ repetition_penalty=1.0 ; Use for sampling
 presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty are allowed.
 len_penalty=0.0
 beam_search_diversity_rate=0.0
-request_batch_size=10 # determine by the request
+request_batch_size=1 # determine by the request
 request_output_len=512 # determine by the request
 
 [llama_7b]
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 891837f48..651307e8e 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,10 +1 @@
-1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From ffd2f9693ab553a6802e9f5dc1b91edf5aec81db Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:19:11 -0700
Subject: [PATCH 22/79] commit

---
 examples/cpp/llama/llama_config.ini | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index 19f6d016d..0fcd5d68a 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -9,11 +9,11 @@ model_name=llama_7b
 model_dir=/notebooks/llama2-7b-chat-tp8/
 
 [request]
-beam_width=1 # beam width for beam search
-top_k=1 ; k value for top k sampling
+beam_width=0 # beam width for beam search
+top_k=0 ; k value for top k sampling
 top_p=0.0 ; p value for top p sampling
-temperature=1.0 ; Use for sampling
-repetition_penalty=1.0 ; Use for sampling
+temperature=0 ; Use for sampling
+repetition_penalty=0 ; Use for sampling
 presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty are allowed.
 len_penalty=0.0
 beam_search_diversity_rate=0.0

From 18eb7b4e354cfaf32a3cb7abb7a3900fd6454bb1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:20:00 -0700
Subject: [PATCH 23/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 130 ++++++++++-----------
 1 file changed, 65 insertions(+), 65 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 559f7afdd..fe95c1396 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -143,71 +143,71 @@ broadCastRequest(const std::vector<int>& v_start_ids,
                 {"end_id",
                  triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}));
 
-        int* beam_width_ptr = new int(param.beam_width);
-        pointer_record->push_back(beam_width_ptr);
-        request_list[device_id]->insert(
-            {"beam_width",
-             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, beam_width_ptr}});
-        if (param.beam_width > 1) {
-            float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate);
-            pointer_record->push_back(beam_search_diversity_rate_ptr);
-            request_list[device_id]->insert(
-                {"beam_search_diversity_rate",
-                 triton::Tensor{
-                     triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, beam_search_diversity_rate_ptr}});
-        }
-        else {
-            if (param.runtime_top_p != 0.0f) {
-                float* runtime_top_p_ptr = new float(param.runtime_top_p);
-                pointer_record->push_back(runtime_top_p_ptr);
-                request_list[device_id]->insert(
-                    {"runtime_top_p",
-                     triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, runtime_top_p_ptr}});
-            }
-            if (param.runtime_top_k != 0) {
-                uint* runtime_top_k_ptr = new uint(param.runtime_top_k);
-                pointer_record->push_back(runtime_top_k_ptr);
-                request_list[device_id]->insert(
-                    {"runtime_top_k",
-                     triton::Tensor{
-                         triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector<size_t>{1}, runtime_top_k_ptr}});
-            }
-        }
-        float* temperature_ptr = new float(param.temperature);
-        pointer_record->push_back(temperature_ptr);
-        request_list[device_id]->insert(
-            {"temperature",
-             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, temperature_ptr}});
-        float* len_penalty_ptr = new float(param.len_penalty);
-        pointer_record->push_back(len_penalty_ptr);
-        request_list[device_id]->insert(
-            {"len_penalty",
-             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, len_penalty_ptr}});
-        if (param.repetition_penalty != 1.0f) {
-            float* repetition_penalty_ptr = new float(param.repetition_penalty);
-            pointer_record->push_back(repetition_penalty_ptr);
-            request_list[device_id]->insert(
-                {"repetition_penalty",
-                 triton::Tensor{
-                     triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, repetition_penalty_ptr}});
-        }
-        if (param.presence_penalty != 0.0f) {
-            float* presence_penalty_ptr = new float(param.presence_penalty);
-            pointer_record->push_back(presence_penalty_ptr);
-            request_list[device_id]->insert(
-                {"presence_penalty",
-                 triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, presence_penalty_ptr}});
-        }
-        int* min_length_ptr = new int(param.min_length);
-        pointer_record->push_back(min_length_ptr);
-        request_list[device_id]->insert(
-            {"min_length",
-             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, min_length_ptr}});
-        unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed);
-        pointer_record->push_back(random_seed_ptr);
-        request_list[device_id]->insert(
-            {"random_seed",
-             triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector<size_t>{1}, random_seed_ptr}});
+        // int* beam_width_ptr = new int(param.beam_width);
+        // pointer_record->push_back(beam_width_ptr);
+        // request_list[device_id]->insert(
+        //     {"beam_width",
+        //      triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, beam_width_ptr}});
+        // if (param.beam_width > 1) {
+        //     float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate);
+        //     pointer_record->push_back(beam_search_diversity_rate_ptr);
+        //     request_list[device_id]->insert(
+        //         {"beam_search_diversity_rate",
+        //          triton::Tensor{
+        //              triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, beam_search_diversity_rate_ptr}});
+        // }
+        // else {
+        //     if (param.runtime_top_p != 0.0f) {
+        //         float* runtime_top_p_ptr = new float(param.runtime_top_p);
+        //         pointer_record->push_back(runtime_top_p_ptr);
+        //         request_list[device_id]->insert(
+        //             {"runtime_top_p",
+        //              triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, runtime_top_p_ptr}});
+        //     }
+        //     if (param.runtime_top_k != 0) {
+        //         uint* runtime_top_k_ptr = new uint(param.runtime_top_k);
+        //         pointer_record->push_back(runtime_top_k_ptr);
+        //         request_list[device_id]->insert(
+        //             {"runtime_top_k",
+        //              triton::Tensor{
+        //                  triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector<size_t>{1}, runtime_top_k_ptr}});
+        //     }
+        // }
+        // float* temperature_ptr = new float(param.temperature);
+        // pointer_record->push_back(temperature_ptr);
+        // request_list[device_id]->insert(
+        //     {"temperature",
+        //      triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, temperature_ptr}});
+        // float* len_penalty_ptr = new float(param.len_penalty);
+        // pointer_record->push_back(len_penalty_ptr);
+        // request_list[device_id]->insert(
+        //     {"len_penalty",
+        //      triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, len_penalty_ptr}});
+        // if (param.repetition_penalty != 1.0f) {
+        //     float* repetition_penalty_ptr = new float(param.repetition_penalty);
+        //     pointer_record->push_back(repetition_penalty_ptr);
+        //     request_list[device_id]->insert(
+        //         {"repetition_penalty",
+        //          triton::Tensor{
+        //              triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, repetition_penalty_ptr}});
+        // }
+        // if (param.presence_penalty != 0.0f) {
+        //     float* presence_penalty_ptr = new float(param.presence_penalty);
+        //     pointer_record->push_back(presence_penalty_ptr);
+        //     request_list[device_id]->insert(
+        //         {"presence_penalty",
+        //          triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector<size_t>{1}, presence_penalty_ptr}});
+        // }
+        // int* min_length_ptr = new int(param.min_length);
+        // pointer_record->push_back(min_length_ptr);
+        // request_list[device_id]->insert(
+        //     {"min_length",
+        //      triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, min_length_ptr}});
+        // unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed);
+        // pointer_record->push_back(random_seed_ptr);
+        // request_list[device_id]->insert(
+        //     {"random_seed",
+        //      triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector<size_t>{1}, random_seed_ptr}});
 
         pointer_record->push_back(d_input_ids);
         pointer_record->push_back(d_input_lengths);

From 28cba076e48c185683c17e959b6b121da048baa8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:21:45 -0700
Subject: [PATCH 24/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index fe95c1396..61f32c1be 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -401,14 +401,15 @@ int main(int argc, char* argv[])
                 // }
                 printf("\n");
                 for (size_t i = 0; i < outCount; i++) {
-                    if (hBuf[i] == int(0))
-                        zeroCount++;
-                    outFile << hBuf[i] << " ";
-                    if ((i + 1) % (seq_len) == 0)
-                        printf("\n\n");
+                    // if (hBuf[i] == int(0))
+                    //     zeroCount++;
+                    // outFile << hBuf[i] << " ";
+
 
                     // if (i < 10)
                         printf("%d,", hBuf[i]);
+                    if ((i + 1) % (seq_len) == 0)
+                        printf("\n\n");
                     // if ((i + 1) % (seq_len) == 0 && i < 10)
                     //     std::cout << std::endl;
                 }

From 165704c9687c367b98d874fc63a5a3b4667d6501 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:24:04 -0700
Subject: [PATCH 25/79] commit

---
 examples/cpp/llama/start_ids.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 651307e8e..218eb01e5 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1 @@
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2

From d792097eb468807e85e43ddf1f43689dc1897347 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:28:27 -0700
Subject: [PATCH 26/79] commit

---
 examples/cpp/llama/start_ids.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 218eb01e5..7cbfab468 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1 @@
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2

From f2534bed301d295c990a2e5427338051c6da8888 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:30:51 -0700
Subject: [PATCH 27/79] commit

---
 examples/cpp/llama/start_ids.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 7cbfab468..218eb01e5 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1 @@
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2

From 67aa2849e789566a59d2f3cc50132cb6923eb6dd Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:31:40 -0700
Subject: [PATCH 28/79] commit

---
 examples/cpp/llama/start_ids.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 218eb01e5..651307e8e 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1 @@
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962

From b41505565abb4ec26e50b931e496f3860dfd620c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:33:36 -0700
Subject: [PATCH 29/79] commit

---
 examples/cpp/llama/start_ids.csv | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 651307e8e..891837f48 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1,10 @@
+1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From 2ecae5e9baccbe891b5ab6c855df85afb89f7208 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:37:56 -0700
Subject: [PATCH 30/79] examples/llama: set request_batch_size=2 and reduce start_ids.csv to two rows

---
 examples/cpp/llama/llama_config.ini |  2 +-
 examples/cpp/llama/start_ids.csv    | 12 ++----------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index 0fcd5d68a..8d34ce813 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -17,7 +17,7 @@ repetition_penalty=0 ; Use for sampling
 presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty are allowed.
 len_penalty=0.0
 beam_search_diversity_rate=0.0
-request_batch_size=1 # determine by the request
+request_batch_size=2 # determine by the request
 request_output_len=512 # determine by the request
 
 [llama_7b]
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 891837f48..12a321d93 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,10 +1,2 @@
-1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962
+1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,29991,518,29914,25580,29962
+1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,29889,518,29914,25580,29962

From 455c6b8155625addbca4e543504218aae9506e27 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:40:14 -0700
Subject: [PATCH 31/79] llama_triton_example: take the start-ids file path as a prepareRequest parameter

---
 examples/cpp/llama/llama_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 61f32c1be..2026ce444 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -219,7 +219,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
 }
 
 std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
-prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record)
+prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record, string file_name)
 {
     INIReader reader = INIReader(ini_name);
     if (reader.ParseError() < 0) {
@@ -242,7 +242,7 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
                        max_input_len,
                        end_id,
                        1,
-                       "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
+                       file_name);
 
     std::vector<int> v_bad_words;
     ft::read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/bad_words.csv", v_bad_words);

From 9ecbfc9b19d79ebfd0b49837056226c14f923f78 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:40:28 -0700
Subject: [PATCH 32/79] llama_triton_example: pass the start_ids.csv path to prepareRequest from main

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 2026ce444..3efd71fb7 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -352,7 +352,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward

From cb8b38df12dc1d8a8289c7cbb1b3e6323d7975a8 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:40:57 -0700
Subject: [PATCH 33/79] llama_triton_example: wrap the start_ids.csv path literal in std::string

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 3efd71fb7..ac36be20a 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -352,7 +352,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"));
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward

From 662d60570664c566e1a4c986e970794a8b554fb4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:41:29 -0700
Subject: [PATCH 34/79] llama_triton_example: qualify file_name as std::string; add VS Code file associations

---
 .vscode/settings.json                      | 16 +++++++++++++++-
 examples/cpp/llama/llama_triton_example.cc |  2 +-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 79166a171..1ef97bcca 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -69,6 +69,20 @@
         "cfenv": "cpp",
         "typeindex": "cpp",
         "variant": "cpp",
-        "ios": "cpp"
+        "ios": "cpp",
+        "__bit_reference": "cpp",
+        "__config": "cpp",
+        "__debug": "cpp",
+        "__errc": "cpp",
+        "__hash_table": "cpp",
+        "__locale": "cpp",
+        "__mutex_base": "cpp",
+        "__node_handle": "cpp",
+        "__split_buffer": "cpp",
+        "__threading_support": "cpp",
+        "__tree": "cpp",
+        "__verbose_abort": "cpp",
+        "charconv": "cpp",
+        "locale": "cpp"
     }
 }
diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index ac36be20a..3f22cddf2 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -219,7 +219,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
 }
 
 std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
-prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record, string file_name)
+prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record, std::string file_name)
 {
     INIReader reader = INIReader(ini_name);
     if (reader.ParseError() < 0) {

From ec6516133859b454a00ef6060576a499fb6646b6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:42:25 -0700
Subject: [PATCH 35/79] llama_triton_example: enclose the request/forward/print section of main in its own scope

---
 examples/cpp/llama/llama_triton_example.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 3f22cddf2..f476f999f 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -349,6 +349,7 @@ int main(int argc, char* argv[])
         t.join();
     }
 
+{
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
@@ -418,6 +419,7 @@ int main(int argc, char* argv[])
             delete[] hBuf;
         }
     }
+}
 
     // test time
     struct timeval start, end;

From d2c0e8f4524d883610d035aefe92f4a24ad0529b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:42:58 -0700
Subject: [PATCH 36/79] llama_triton_example: comment out the timed forward loop in main

---
 examples/cpp/llama/llama_triton_example.cc | 40 +++++++++++-----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index f476f999f..d3ca14669 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -421,26 +421,26 @@ int main(int argc, char* argv[])
     }
 }
 
-    // test time
-    struct timeval start, end;
-    ft::mpi::barrier();
-    cudaDeviceSynchronize();
-    gettimeofday(&start, NULL);
-
-    const int ite = 1;
-    for (int i = 0; i < ite; i++) {
-        threads.clear();
-        for (int device_id = 0; device_id < gpu_count; device_id++) {
-            threads.push_back(std::thread(threadForward,
-                                          &model_instances[device_id],
-                                          request_list[device_id],
-                                          &output_tensors_lists[device_id],
-                                          device_id));
-        }
-        for (auto& t : threads) {
-            t.join();
-        }
-    }
+    // // test time
+    // struct timeval start, end;
+    // ft::mpi::barrier();
+    // cudaDeviceSynchronize();
+    // gettimeofday(&start, NULL);
+
+    // const int ite = 1;
+    // for (int i = 0; i < ite; i++) {
+    //     threads.clear();
+    //     for (int device_id = 0; device_id < gpu_count; device_id++) {
+    //         threads.push_back(std::thread(threadForward,
+    //                                       &model_instances[device_id],
+    //                                       request_list[device_id],
+    //                                       &output_tensors_lists[device_id],
+    //                                       device_id));
+    //     }
+    //     for (auto& t : threads) {
+    //         t.join();
+    //     }
+    // }
 
     cudaDeviceSynchronize();
     ft::mpi::barrier();

From 7afad53934c6246d2cf9cb766a469a73154a5db4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:43:18 -0700
Subject: [PATCH 37/79] llama_triton_example: comment out the timing printout in main

---
 examples/cpp/llama/llama_triton_example.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index d3ca14669..a669b16af 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -445,14 +445,14 @@ int main(int argc, char* argv[])
     cudaDeviceSynchronize();
     ft::mpi::barrier();
 
-    gettimeofday(&end, NULL);
-
-    printf("[INFO] batch_size %d beam_width %d seq_len %d"
-           " FT-CPP-GPT-Triton-time %.2f ms\n",
-           batch_size,
-           beam_width,
-           seq_len,
-           ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
+    // gettimeofday(&end, NULL);
+
+    // printf("[INFO] batch_size %d beam_width %d seq_len %d"
+    //        " FT-CPP-GPT-Triton-time %.2f ms\n",
+    //        batch_size,
+    //        beam_width,
+    //        seq_len,
+    //        ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
 
     ft::mpi::finalize();
     return 0;

From 2804459b01da85525dfc0b6d666b665a3562e361 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:44:00 -0700
Subject: [PATCH 38/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 72 ++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index a669b16af..b9bd81b13 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -394,6 +394,78 @@ int main(int argc, char* argv[])
             ft::cudaD2Hcpy(hBuf, d_output_ids, outCount);
             
 
+            {
+                std::cout << "Writing " << outCount << " elements\n";
+                int zeroCount = 0;
+                // for (int i=0; i<batch_size; i++) {
+                //     printf("%d ", iBuf[i]);
+                // }
+                printf("\n");
+                for (size_t i = 0; i < outCount; i++) {
+                    // if (hBuf[i] == int(0))
+                    //     zeroCount++;
+                    // outFile << hBuf[i] << " ";
+
+
+                    // if (i < 10)
+                        printf("%d,", hBuf[i]);
+                    if ((i + 1) % (seq_len) == 0)
+                        printf("\n\n");
+                    // if ((i + 1) % (seq_len) == 0 && i < 10)
+                    //     std::cout << std::endl;
+                }
+                std::cout << std::endl << "zeroCount = " << zeroCount << std::endl;
+            }
+            delete[] hBuf;
+        }
+    }
+}
+
+{
+    // step 4: prepare request
+    std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
+    std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"));
+    printf("[INFO] request is created : %d\n", request_list.size());
+
+    // step 5: Forward
+    std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> output_tensors_lists(
+        (size_t)gpu_count);
+    for (int i = 0; i < 1; i++) {
+        threads.clear();
+        for (int device_id = 0; device_id < gpu_count; device_id++) {
+            threads.push_back(std::thread(threadForward,
+                                          &model_instances[device_id],
+                                          request_list[device_id],
+                                          &output_tensors_lists[device_id],
+                                          device_id));
+        }
+        for (auto& t : threads) {
+            t.join();
+        }
+    }
+    printf("[INFO] forward is completed. \n");
+
+    const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data;
+    const int  batch_size   = output_tensors_lists[0].get()->at("output_ids").shape[0];
+    const int  beam_width   = output_tensors_lists[0].get()->at("output_ids").shape[1];
+    const int  seq_len      = output_tensors_lists[0].get()->at("output_ids").shape[2];
+    printf("%d %d %d\n", batch_size, beam_width, seq_len);
+    // step 6: check results
+    if (node_id == 0) {
+
+        std::string fName   = "out";
+        auto        outFile = std::ofstream(fName, std::ios::out);
+        if (!outFile.is_open()) {
+            printf("[WARNING] Cannot write results into output file %s \n", fName.c_str());
+        }
+        else {
+            size_t outCount = batch_size * beam_width * seq_len;
+            int*   hBuf     = new int[outCount];
+            int*   iBuf     = new int[batch_size];
+            ft::cudaD2Hcpy(hBuf, d_output_ids, outCount);
+            
+
             {
                 std::cout << "Writing " << outCount << " elements\n";
                 int zeroCount = 0;

From 3e431f9514b20d7c43586b4b4eac029cb93e99fa Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:45:00 -0700
Subject: [PATCH 39/79] commit

---
 examples/cpp/llama/llama_triton_example.cc |  2 +-
 examples/cpp/llama/start_ids.csv           | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index b9bd81b13..1f9558c7f 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"));
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"));
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 12a321d93..891837f48 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,2 +1,10 @@
-1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,29991,518,29914,25580,29962
-1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,29889,518,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From 748aa0c1b0cab039d5162941a1d2a1ab86cb2247 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:46:06 -0700
Subject: [PATCH 40/79] commit

---
 examples/cpp/llama/start_ids2.csv | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 examples/cpp/llama/start_ids2.csv

diff --git a/examples/cpp/llama/start_ids2.csv b/examples/cpp/llama/start_ids2.csv
new file mode 100644
index 000000000..12a321d93
--- /dev/null
+++ b/examples/cpp/llama/start_ids2.csv
@@ -0,0 +1,2 @@
+1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,29991,518,29914,25580,29962
+1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,29889,518,29914,25580,29962

From 1f4210e5ea9262e5a6f7fe537162bc17138e8ead Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:46:37 -0700
Subject: [PATCH 41/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 1f9558c7f..330fa10d5 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -219,7 +219,7 @@ broadCastRequest(const std::vector<int>& v_start_ids,
 }
 
 std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
-prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record, std::string file_name)
+prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record, std::string file_name, size_t request_batch_size)
 {
     INIReader reader = INIReader(ini_name);
     if (reader.ParseError() < 0) {
@@ -227,7 +227,7 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
         ft::FT_CHECK(false);
     }
 
-    const size_t request_batch_size = reader.GetInteger("request", "request_batch_size");
+    // const size_t request_batch_size = reader.GetInteger("request", "request_batch_size");
 
     const int start_id = reader.GetInteger("llama_7b", "start_id");
     const int end_id   = reader.GetInteger("llama_7b", "end_id");
@@ -353,7 +353,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"));
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"));
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"), 2);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward

From f1dac5cd0ad8b7c028d8af8cbf980edeb9212f00 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:50:02 -0700
Subject: [PATCH 42/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 330fa10d5..6fd22b8b7 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -353,7 +353,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record,  std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"), 2);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"), 2);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward

From 40d7e8b196f2daf5bc31fc3545c104ac27bcc474 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:52:35 -0700
Subject: [PATCH 43/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 6fd22b8b7..b1ac4335a 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 1);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 891837f48..e1389ab16 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,10 +1 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From dd09060645637c0707cb49756cceb18d4311c702 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:54:09 -0700
Subject: [PATCH 44/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index b1ac4335a..18063d828 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 1);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index e1389ab16..7563a7588 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1,2 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962

From f8f43a4bca89d0b70f8fa53a486e7f7a8ed1aeee Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:55:16 -0700
Subject: [PATCH 45/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 18063d828..30bd0f108 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 5);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 7563a7588..d271af9fa 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,2 +1,5 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962

From 7bfb5186523be90680ae03495018c291aad39dd9 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:56:03 -0700
Subject: [PATCH 46/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 30bd0f108..6fd22b8b7 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 5);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index d271af9fa..891837f48 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -3,3 +3,8 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From cfeec21a733940d18d455caa3b6eaf358f3c0b05 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:57:00 -0700
Subject: [PATCH 47/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 8 --------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 6fd22b8b7..18063d828 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 891837f48..38ce8e45b 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,10 +1,2 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From dadcc23ee51855a9b0b2f0e2872bfc0fa5a79d35 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:58:47 -0700
Subject: [PATCH 48/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 18063d828..6e2e15971 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 8);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 38ce8e45b..d1152bbd7 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1,2 +1,8 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962

From 99ffd0ed1e4d4f0b24abc52ceaa23409b817fa8c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 21:59:42 -0700
Subject: [PATCH 49/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 2 +-
 examples/cpp/llama/start_ids.csv           | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 6e2e15971..6fd22b8b7 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -425,7 +425,7 @@ int main(int argc, char* argv[])
     // step 4: prepare request
     std::vector<void*> pointer_record;  // Used to prevent the pointers are release after leaving functions
     std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
-        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 8);
+        prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10);
     printf("[INFO] request is created : %d\n", request_list.size());
 
     // step 5: Forward
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index d1152bbd7..891837f48 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -6,3 +6,5 @@
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962
 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962

From a89d4b33b7dbf06dcdcceacf4b67543d9c09a882 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:03:23 -0700
Subject: [PATCH 50/79] commit

---
 src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
index 14d31d02d..2ddedd629 100644
--- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
@@ -197,7 +197,7 @@ std::unique_ptr<AbstractTransformerModelInstance> LlamaTritonModel<T>::createMod
                      stream,
                      cublas_wrapper.get(),
                      allocator.get(),
-                     false,
+                     true,
                      cuda_device_prop_ptr.get(),
                      attention_type,
                      int8_mode_,

From 4973c9b65928d9a9ad357ca46c1ee70a64f348df Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:05:18 -0700
Subject: [PATCH 51/79] commit

---
 examples/cpp/llama/llama_triton_example.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index 6fd22b8b7..d1f163146 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -481,8 +481,10 @@ int main(int argc, char* argv[])
 
                     // if (i < 10)
                         printf("%d,", hBuf[i]);
-                    if ((i + 1) % (seq_len) == 0)
+                    if ((i + 1) % (seq_len) == 0) {
                         printf("\n\n");
+                        break;
+                    }
                     // if ((i + 1) % (seq_len) == 0 && i < 10)
                     //     std::cout << std::endl;
                 }

From caf7c00f699e6bfc7b76cb20258e70a2f87a58eb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:05:49 -0700
Subject: [PATCH 52/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 23c5c4e4a..01aaa88d4 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -365,6 +365,7 @@ Llama<T>::Llama(size_t                              head_num,
     int8_mode_(int8_mode),
     shared_contexts_ratio_(shared_contexts_ratio)
 {
+    printf("is_free_buffer_after_forward: %d\n", is_free_buffer_after_forward);
     int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_);
     if (std::is_same<half, T>::value) {
         local_vacab_size = ceil(local_vacab_size / 8.f) * 8;

From f7af3698365667967c7c863af30b6a171a7d7fce Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:07:33 -0700
Subject: [PATCH 53/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 44 ++++++++++-----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 01aaa88d4..b0dcbf59b 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -115,19 +115,19 @@ void Llama<T>::allocateBuffer(
     }
 
     input_attention_mask_ = (T*)(allocator_->reMalloc(
-        input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false));
-    decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
+        input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, true));
+    decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, true));
     decoder_output_buf_ =
-        (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
+        (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true));
     normed_decoder_output_buf_ =
-        (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
-    logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false));
+        (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true));
+    logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true));
     nccl_logits_buf_ =
-        (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false));
-    cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false));
-    finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false));
+        (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true));
+    cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true));
+    finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true));
     h_finished_buf_   = new bool[batchxbeam];
-    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false));
+    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true));
 
     key_cache_   = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true));
     value_cache_ = key_cache_ + self_cache_size;
@@ -139,40 +139,40 @@ void Llama<T>::allocateBuffer(
 
     // prompt_learning weight batch ptrs
     prompt_learning_weight_batch_ =
-        (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
+        (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, true));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));
     tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true));
     tiled_total_padding_count_ =
-        (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false);
+        (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), true);
 
     transposed_output_ids_buf_ =
         (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true));
     output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true));
     parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true));
-    seq_limit_len_  = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false));
+    seq_limit_len_  = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, true));
     masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true));
 
-    start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false));
-    end_ids_buf_   = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false));
+    start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, true));
+    end_ids_buf_   = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, true));
 
     context_decoder_input_buf_  = (T*)(allocator_->reMalloc(
-        context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));
+        context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true));
     context_decoder_output_buf_ = (T*)(allocator_->reMalloc(
-        context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));
+        context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true));
     output_log_probs_buf_ =
-        (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false));
+        (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, true));
 
     generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true);
 
     if (shared_contexts_ratio_ > 0.0f) {
-        shared_contexts_idx_  = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false);
-        batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false);
-        compact_idx_          = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false);
-        compact_size_         = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false);
+        shared_contexts_idx_  = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), true);
+        batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), true);
+        compact_idx_          = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), true);
+        compact_size_         = (int*)allocator_->reMalloc(compact_size_, sizeof(int), true);
     }
 
     is_allocate_buffer_ = true;

From 4545f61daca4373caef52782d9abcaa6f5400245 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:08:59 -0700
Subject: [PATCH 54/79] Llama: set reMalloc flag to false for context input, log-prob and shared-context buffers

---
 src/fastertransformer/models/llama/Llama.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index b0dcbf59b..7fdec4cd2 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -160,19 +160,19 @@ void Llama<T>::allocateBuffer(
     end_ids_buf_   = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, true));
 
     context_decoder_input_buf_  = (T*)(allocator_->reMalloc(
-        context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true));
+        context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));
     context_decoder_output_buf_ = (T*)(allocator_->reMalloc(
         context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true));
     output_log_probs_buf_ =
-        (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, true));
+        (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false));
 
     generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true);
 
     if (shared_contexts_ratio_ > 0.0f) {
-        shared_contexts_idx_  = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), true);
-        batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), true);
-        compact_idx_          = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), true);
-        compact_size_         = (int*)allocator_->reMalloc(compact_size_, sizeof(int), true);
+        shared_contexts_idx_  = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false);
+        batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false);
+        compact_idx_          = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false);
+        compact_size_         = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false);
     }
 
     is_allocate_buffer_ = true;

From 376110d5bfa7dae78c6a698d0fd6e6625772735a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:09:39 -0700
Subject: [PATCH 55/79] Llama: set reMalloc flag to false for context_decoder_output_buf_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 7fdec4cd2..39b37fdfd 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -162,7 +162,7 @@ void Llama<T>::allocateBuffer(
     context_decoder_input_buf_  = (T*)(allocator_->reMalloc(
         context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));
     context_decoder_output_buf_ = (T*)(allocator_->reMalloc(
-        context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true));
+        context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));
     output_log_probs_buf_ =
         (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false));
 

From 74df02735cb399a6a55a40344b88dd91a34332b7 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:10:22 -0700
Subject: [PATCH 56/79] Llama: set reMalloc flag to false for padding-count, seq-limit and start/end id buffers

---
 src/fastertransformer/models/llama/Llama.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 39b37fdfd..d08b892f5 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -147,17 +147,17 @@ void Llama<T>::allocateBuffer(
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));
     tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true));
     tiled_total_padding_count_ =
-        (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), true);
+        (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false);
 
     transposed_output_ids_buf_ =
         (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true));
     output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true));
     parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true));
-    seq_limit_len_  = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, true));
+    seq_limit_len_  = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false));
     masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true));
 
-    start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, true));
-    end_ids_buf_   = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, true));
+    start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false));
+    end_ids_buf_   = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false));
 
     context_decoder_input_buf_  = (T*)(allocator_->reMalloc(
         context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false));

From eaa0a170f514e564d15f47428b1bee9daf86025a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:11:22 -0700
Subject: [PATCH 57/79] LlamaTritonModel: pass is_free_buffer_after_forward=false

---
 src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
index 2ddedd629..a263c9330 100644
--- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
@@ -197,7 +197,7 @@ std::unique_ptr<AbstractTransformerModelInstance> LlamaTritonModel<T>::createMod
                      stream,
                      cublas_wrapper.get(),
                      allocator.get(),
-                     true,
+                     false, // is_free_buffer_after_forward
                      cuda_device_prop_ptr.get(),
                      attention_type,
                      int8_mode_,

From 580a7963d1100ff507b8ee3c57dbf696b98d3347 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:12:16 -0700
Subject: [PATCH 58/79] Llama: set reMalloc flag to false for sequence_lengths_ and prompt-learning buffers

---
 src/fastertransformer/models/llama/Llama.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index d08b892f5..ac4590b18 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -127,7 +127,7 @@ void Llama<T>::allocateBuffer(
     cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true));
     finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true));
     h_finished_buf_   = new bool[batchxbeam];
-    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true));
+    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false));
 
     key_cache_   = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true));
     value_cache_ = key_cache_ + self_cache_size;
@@ -139,9 +139,9 @@ void Llama<T>::allocateBuffer(
 
     // prompt_learning weight batch ptrs
     prompt_learning_weight_batch_ =
-        (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, true));
+        (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));

From 9255f7c46186282edb388c0dfde53c6e73de9ea5 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:12:58 -0700
Subject: [PATCH 59/79] Llama: set reMalloc flag back to true for sequence_lengths_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index ac4590b18..231f3bd18 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -127,7 +127,7 @@ void Llama<T>::allocateBuffer(
     cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true));
     finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true));
     h_finished_buf_   = new bool[batchxbeam];
-    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false));
+    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true));
 
     key_cache_   = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true));
     value_cache_ = key_cache_ + self_cache_size;

From debacbd6566effa153e83cbf136abd9b89bef7d5 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:13:39 -0700
Subject: [PATCH 60/79] Llama: set reMalloc flag back to true for tiled_prompt_lengths_buf_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 231f3bd18..e7f8da12a 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -141,7 +141,7 @@ void Llama<T>::allocateBuffer(
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));

From 62e4177e400a8059f79c6780309e21847a388eb3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:14:16 -0700
Subject: [PATCH 61/79] Llama: set reMalloc flag to false again for sequence_lengths_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index e7f8da12a..470c2a888 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -127,7 +127,7 @@ void Llama<T>::allocateBuffer(
     cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true));
     finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true));
     h_finished_buf_   = new bool[batchxbeam];
-    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true));
+    sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false));
 
     key_cache_   = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true));
     value_cache_ = key_cache_ + self_cache_size;

From 4f14e32311ccaf6090b0cd383d27144d26874564 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:15:57 -0700
Subject: [PATCH 62/79] llama_triton_example: remove break from output print loop

---
 examples/cpp/llama/llama_triton_example.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
index d1f163146..5fbc56a1e 100644
--- a/examples/cpp/llama/llama_triton_example.cc
+++ b/examples/cpp/llama/llama_triton_example.cc
@@ -483,7 +483,6 @@ int main(int argc, char* argv[])
                         printf("%d,", hBuf[i]);
                     if ((i + 1) % (seq_len) == 0) {
                         printf("\n\n");
-                        break;
                     }
                     // if ((i + 1) % (seq_len) == 0 && i < 10)
                     //     std::cout << std::endl;

From 34b48e803306a4e7c46d5d95ab13b52bb8e6c0b2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:18:24 -0700
Subject: [PATCH 63/79] Llama: set reMalloc flag to false for decoder/logits buffers; drop constructor debug printf

---
 src/fastertransformer/models/llama/Llama.cc | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 470c2a888..23c5c4e4a 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -115,17 +115,17 @@ void Llama<T>::allocateBuffer(
     }
 
     input_attention_mask_ = (T*)(allocator_->reMalloc(
-        input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, true));
-    decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, true));
+        input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false));
+    decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
     decoder_output_buf_ =
-        (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true));
+        (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
     normed_decoder_output_buf_ =
-        (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true));
-    logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true));
+        (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false));
+    logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false));
     nccl_logits_buf_ =
-        (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true));
-    cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true));
-    finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true));
+        (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false));
+    cum_log_probs_    = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false));
+    finished_buf_     = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false));
     h_finished_buf_   = new bool[batchxbeam];
     sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false));
 
@@ -141,7 +141,7 @@ void Llama<T>::allocateBuffer(
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));
@@ -365,7 +365,6 @@ Llama<T>::Llama(size_t                              head_num,
     int8_mode_(int8_mode),
     shared_contexts_ratio_(shared_contexts_ratio)
 {
-    printf("is_free_buffer_after_forward: %d\n", is_free_buffer_after_forward);
     int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_);
     if (std::is_same<half, T>::value) {
         local_vacab_size = ceil(local_vacab_size / 8.f) * 8;

From 596f6d978a29bd538b03156eca5f19e65340199b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:18:28 -0700
Subject: [PATCH 64/79] Llama: set reMalloc flag to true for tiled_prompt_lengths_buf_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 23c5c4e4a..c889f2db4 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -141,7 +141,7 @@ void Llama<T>::allocateBuffer(
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));

From 5772f09365288393f0c0ad54e66c42ee1c558467 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:19:07 -0700
Subject: [PATCH 65/79] Llama: set reMalloc flag to false for tiled_prompt_lengths_buf_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index c889f2db4..23c5c4e4a 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -141,7 +141,7 @@ void Llama<T>::allocateBuffer(
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));

From 96ccec9ef9a0b9f200b9f9b9309346d3dba38038 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:20:00 -0700
Subject: [PATCH 66/79] Llama: set reMalloc flag to true for tiled_prompt_lengths_buf_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 23c5c4e4a..c889f2db4 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -141,7 +141,7 @@ void Llama<T>::allocateBuffer(
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));

From 04f5ab2bfceb07be377446b74125c0e45019b7f6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:20:19 -0700
Subject: [PATCH 67/79] vscode: remove extra file associations from settings.json

---
 .vscode/settings.json | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 1ef97bcca..82000232b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -68,21 +68,6 @@
         "future": "cpp",
         "cfenv": "cpp",
         "typeindex": "cpp",
-        "variant": "cpp",
-        "ios": "cpp",
-        "__bit_reference": "cpp",
-        "__config": "cpp",
-        "__debug": "cpp",
-        "__errc": "cpp",
-        "__hash_table": "cpp",
-        "__locale": "cpp",
-        "__mutex_base": "cpp",
-        "__node_handle": "cpp",
-        "__split_buffer": "cpp",
-        "__threading_support": "cpp",
-        "__tree": "cpp",
-        "__verbose_abort": "cpp",
-        "charconv": "cpp",
-        "locale": "cpp"
+        "variant": "cpp"
     }
 }

From c79afa9e350c2ae84f3bc534adb42bdff2a9c99b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:37:10 -0700
Subject: [PATCH 68/79] Llama: set reMalloc flag to false for tiled_prompt_lengths_buf_

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index c889f2db4..23c5c4e4a 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -141,7 +141,7 @@ void Llama<T>::allocateBuffer(
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
     tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
+        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));

From dbd5287ecb23e7bbcd8c95b316d417663450a37c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:38:13 -0700
Subject: [PATCH 69/79] Llama: add debug printf of has_prefix_prompt_ in forward()

---
 src/fastertransformer/models/llama/Llama.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 23c5c4e4a..debe3dbb1 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -640,6 +640,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
     }
 
     // Prefix prompts
+    printf("has_prefix_prompt_: %d\n", has_prefix_prompt_);
     if (has_prefix_prompt_) {
         cudaMemcpyAsync(prompt_learning_weight_batch_,
                         prefix_prompt_weight_batch_ptrs.data(),

From 599e8dadc9e8969965351c18953aef50ec9804d6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:42:27 -0700
Subject: [PATCH 70/79] Llama: comment out invokeBuildDecoderAttentionMask call in forward()

---
 src/fastertransformer/models/llama/Llama.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index debe3dbb1..6e44a4344 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -706,13 +706,13 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
             sync_check_cuda_error();
         }
 
-        invokeBuildDecoderAttentionMask(input_attention_mask_,
-                                        tiled_input_lengths_buf_,
-                                        tiled_prompt_lengths_buf_,
-                                        batch_size * beam_width,
-                                        max_input_length,
-                                        max_prefix_prompt_length,
-                                        stream_);
+        // invokeBuildDecoderAttentionMask(input_attention_mask_,
+        //                                 tiled_input_lengths_buf_,
+        //                                 tiled_prompt_lengths_buf_,
+        //                                 batch_size * beam_width,
+        //                                 max_input_length,
+        //                                 max_prefix_prompt_length,
+        //                                 stream_);
         sync_check_cuda_error();
 
         std::unordered_map<std::string, Tensor> decoder_input_tensors{

From 59f2c935c9262f9c1d198db128aa9b8c498c84e0 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:43:27 -0700
Subject: [PATCH 71/79] Llama: comment out sync_check_cuda_error and invokeMaskPaddingTokens calls in forward()

---
 src/fastertransformer/models/llama/Llama.cc | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 6e44a4344..7b7c5e706 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -713,7 +713,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
         //                                 max_input_length,
         //                                 max_prefix_prompt_length,
         //                                 stream_);
-        sync_check_cuda_error();
+        // sync_check_cuda_error();
 
         std::unordered_map<std::string, Tensor> decoder_input_tensors{
             {"decoder_input",
@@ -837,15 +837,15 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
         sync_check_cuda_error();
     }
 
-    invokeMaskPaddingTokens(masked_tokens_,
-                            input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
-                            tiled_prompt_lengths_buf_,
-                            max_cache_seq_len,
-                            max_input_length + max_prefix_prompt_length,
-                            0,
-                            batch_size,
-                            beam_width,
-                            stream_);
+    // invokeMaskPaddingTokens(masked_tokens_,
+    //                         input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
+    //                         tiled_prompt_lengths_buf_,
+    //                         max_cache_seq_len,
+    //                         max_input_length + max_prefix_prompt_length,
+    //                         0,
+    //                         batch_size,
+    //                         beam_width,
+    //                         stream_);
 
     for (int step = max_input_length; step < (int)max_output_seq_len; step++) {
         const int src_indir_idx = (step - max_input_length) % 2;

From f330f2e88f632dc5fa7e495dcfb542828514f0bd Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:45:12 -0700
Subject: [PATCH 72/79] Llama: restore invokeMaskPaddingTokens call in forward()

---
 src/fastertransformer/models/llama/Llama.cc | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 7b7c5e706..aa4e14abd 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -837,15 +837,15 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
         sync_check_cuda_error();
     }
 
-    // invokeMaskPaddingTokens(masked_tokens_,
-    //                         input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
-    //                         tiled_prompt_lengths_buf_,
-    //                         max_cache_seq_len,
-    //                         max_input_length + max_prefix_prompt_length,
-    //                         0,
-    //                         batch_size,
-    //                         beam_width,
-    //                         stream_);
+    invokeMaskPaddingTokens(masked_tokens_,
+                            input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
+                            tiled_prompt_lengths_buf_,
+                            max_cache_seq_len,
+                            max_input_length + max_prefix_prompt_length,
+                            0,
+                            batch_size,
+                            beam_width,
+                            stream_);
 
     for (int step = max_input_length; step < (int)max_output_seq_len; step++) {
         const int src_indir_idx = (step - max_input_length) % 2;

From 3ef5d241313e5c75aa2040bd95971b7ee1c6c7cb Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:46:09 -0700
Subject: [PATCH 73/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index aa4e14abd..debe3dbb1 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -706,14 +706,14 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
             sync_check_cuda_error();
         }
 
-        // invokeBuildDecoderAttentionMask(input_attention_mask_,
-        //                                 tiled_input_lengths_buf_,
-        //                                 tiled_prompt_lengths_buf_,
-        //                                 batch_size * beam_width,
-        //                                 max_input_length,
-        //                                 max_prefix_prompt_length,
-        //                                 stream_);
-        // sync_check_cuda_error();
+        invokeBuildDecoderAttentionMask(input_attention_mask_,
+                                        tiled_input_lengths_buf_,
+                                        tiled_prompt_lengths_buf_,
+                                        batch_size * beam_width,
+                                        max_input_length,
+                                        max_prefix_prompt_length,
+                                        stream_);
+        sync_check_cuda_error();
 
         std::unordered_map<std::string, Tensor> decoder_input_tensors{
             {"decoder_input",

From 8e57eb5e270ecaefb39ad97f67421083d03e3aa1 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:50:06 -0700
Subject: [PATCH 74/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index debe3dbb1..172170c75 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -640,7 +640,6 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
     }
 
     // Prefix prompts
-    printf("has_prefix_prompt_: %d\n", has_prefix_prompt_);
     if (has_prefix_prompt_) {
         cudaMemcpyAsync(prompt_learning_weight_batch_,
                         prefix_prompt_weight_batch_ptrs.data(),
@@ -837,6 +836,21 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
         sync_check_cuda_error();
     }
 
+    {
+        
+        int* buf;
+        int st = batch_size * beam_width;
+        buf = new int[st];
+        cudaMemcpy(buf, tiled_prompt_lengths_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+        {
+            printf("tiled_prompt_lengths_buf_:\n");
+            for (int i=0; i<st; i++) {
+                printf("%d ", (buf[i]));
+            }
+            printf("buf last: %f\n", double(buf[st-1]));
+            printf("\n");
+        }
+    }
     invokeMaskPaddingTokens(masked_tokens_,
                             input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
                             tiled_prompt_lengths_buf_,

From 3e502435b3451ecbd76d7ebebe05bbfa89c60d71 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:52:12 -0700
Subject: [PATCH 75/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 172170c75..fb6045104 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -853,7 +853,6 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
     }
     invokeMaskPaddingTokens(masked_tokens_,
                             input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
-                            tiled_prompt_lengths_buf_,
                             max_cache_seq_len,
                             max_input_length + max_prefix_prompt_length,
                             0,

From 87cfd581fb2d30b0bc05ea48da9780f5b3a4a42f Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:55:44 -0700
Subject: [PATCH 76/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index fb6045104..1cc96b073 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -704,6 +704,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
                                                      stream_);
             sync_check_cuda_error();
         }
+        printf("invokeBuildDecoderAttentionMask\n");
 
         invokeBuildDecoderAttentionMask(input_attention_mask_,
                                         tiled_input_lengths_buf_,

From 09b5f4501ce6334ad72fda4895867e699d081364 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:58:55 -0700
Subject: [PATCH 77/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 1cc96b073..eabb0f217 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -704,11 +704,10 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
                                                      stream_);
             sync_check_cuda_error();
         }
-        printf("invokeBuildDecoderAttentionMask\n");
 
         invokeBuildDecoderAttentionMask(input_attention_mask_,
                                         tiled_input_lengths_buf_,
-                                        tiled_prompt_lengths_buf_,
+                                        nullptr, // prefix_prompt_lengths
                                         batch_size * beam_width,
                                         max_input_length,
                                         max_prefix_prompt_length,
@@ -837,21 +836,6 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
         sync_check_cuda_error();
     }
 
-    {
-        
-        int* buf;
-        int st = batch_size * beam_width;
-        buf = new int[st];
-        cudaMemcpy(buf, tiled_prompt_lengths_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-        {
-            printf("tiled_prompt_lengths_buf_:\n");
-            for (int i=0; i<st; i++) {
-                printf("%d ", (buf[i]));
-            }
-            printf("buf last: %f\n", double(buf[st-1]));
-            printf("\n");
-        }
-    }
     invokeMaskPaddingTokens(masked_tokens_,
                             input_tensors->at("input_lengths").getPtr<const int>(),  // not_tiled
                             max_cache_seq_len,

From 407a8684eae2d84599457120cafae12328bfce36 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 22:59:37 -0700
Subject: [PATCH 78/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index eabb0f217..ad390d551 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -707,7 +707,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
 
         invokeBuildDecoderAttentionMask(input_attention_mask_,
                                         tiled_input_lengths_buf_,
-                                        nullptr, // prefix_prompt_lengths
+                                        (const int*)nullptr, // prefix_prompt_lengths
                                         batch_size * beam_width,
                                         max_input_length,
                                         max_prefix_prompt_length,

From 2d7be1a88e4b1a63917f0a14c6b73125df1c2e80 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang <flex.wang@snowflake.com>
Date: Mon, 30 Oct 2023 23:02:46 -0700
Subject: [PATCH 79/79] commit

---
 src/fastertransformer/models/llama/Llama.cc | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 64a7cf5f0..01ebc0e48 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -140,8 +140,6 @@ void Llama<T>::allocateBuffer(
     // prompt_learning weight batch ptrs
     prompt_learning_weight_batch_ =
         (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false));
-    tiled_prompt_lengths_buf_ =
-        (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true));
 
     tiled_input_ids_buf_ =
         (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true));
@@ -204,7 +202,6 @@ void Llama<T>::freeBuffer()
         }
 
         allocator_->free((void**)(&prompt_learning_weight_batch_));
-        allocator_->free((void**)(&tiled_prompt_lengths_buf_));
 
         allocator_->free((void**)(&tiled_input_ids_buf_));
         allocator_->free((void**)(&tiled_input_lengths_buf_));
@@ -639,22 +636,6 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>*       output_ten
         sync_check_cuda_error();
     }
 
-    // Prefix prompts
-    if (has_prefix_prompt_) {
-        cudaMemcpyAsync(prompt_learning_weight_batch_,
-                        prefix_prompt_weight_batch_ptrs.data(),
-                        sizeof(T*) * batch_size * beam_width,
-                        cudaMemcpyDefault,
-                        stream_);
-        cudaMemcpyAsync(tiled_prompt_lengths_buf_,
-                        prefix_prompt_lengths.data(),
-                        sizeof(int) * batch_size * beam_width,
-                        cudaMemcpyDefault,
-                        stream_);
-    }
-
-    sync_check_cuda_error();
-
     // handle first step
     if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) {
         invokeTileGptInputs(tiled_input_ids_buf_,