From 743369a60a6054257983375ac5008e3f19129032 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Mon, 10 Jul 2023 19:24:07 -0700 Subject: [PATCH 01/79] Merge with main (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update beam_search_topk_kernels.cu fix: fix bug of beam search * fix: change int of some kernels to int64_t to prevent overflow * fix: gpt tensor shapes inconsistency (#505) Signed-off-by: AkiyamaYummy <842720660@qq.com> * Update gpt_guide.md (#529) * fix: fix bug of gpt buffer and gpt gemm overflow * Update T5DecodingWeight.cc fix: fix loading bug of t5 * [Enhancement]add pytorch backend support for gptneox (#550) * add pytorch backend support for gptneox Signed-off-by: AkiyamaYummy <842720660@qq.com> * fix early stopping invalid * 1) Some unused parameters and logic have been removed. 2) Revisions that would affect pipeline parallelism have been reverted. 3) The code has been made capable of direct validation on TabbyML/NeoX-1.3B. Signed-off-by: AkiyamaYummy <842720660@qq.com> * Change the names of classes, removing 'parallel' from their names Signed-off-by: AkiyamaYummy <842720660@qq.com> * Format the code. Signed-off-by: AkiyamaYummy <842720660@qq.com> * Only print results when rank is 0. Signed-off-by: AkiyamaYummy <842720660@qq.com> * Add dist.init_process_group(). 
Signed-off-by: AkiyamaYummy <842720660@qq.com> * update docs Signed-off-by: AkiyamaYummy <842720660@qq.com> --------- Signed-off-by: AkiyamaYummy <842720660@qq.com> * Update cublasMMWrapper.cc Fix the CUBLAS_VERSION checking of cublasMMWrapper * Update cublasMMWrapper.cc * fix overflow in softmax_kernel when process long seqlen and big batch_size (#524) * Update unfused_attention_kernels.cu fix bug of softmax kernel * [Enhancement]create huggingface_gptneox_convert.py (#569) * create huggingface_gptneox_convert.py Signed-off-by: AkiyamaYummy <842720660@qq.com> * adjust HF's multi bin files Signed-off-by: AkiyamaYummy <842720660@qq.com> * update gptneox_guide.md Signed-off-by: AkiyamaYummy <842720660@qq.com> --------- Signed-off-by: AkiyamaYummy <842720660@qq.com> * perf(bloom): improve performance of huggingface_bloom_convert.py, decrease the time cost and the mem using (#568) Co-authored-by: r.yang * Fix/gpt early stop (#584) * fix: fix bug of early stopping of gpt * [bugfix] Fix 2-shot All Reduce correctness issue (indexing bug). (#672) FasterTransformer 2-shot all reduce is implemented as a reduce-scatter + all-gather. There is an indexing bug in the all-gather step. Prior to this change, 2-shot all reduce was only producing correct results on device 0. Now, all devices have the correct results. 
* fix: swap tensor bug (#683) * Support size_per_head=112 (#660) * fix multi-gpu build * add support for size_per_head=112 for gpt decoder * remove mpi_cxx from multi-gpu build for now (#705) --------- Signed-off-by: AkiyamaYummy <842720660@qq.com> Co-authored-by: byshiue Co-authored-by: _yummy_ <842720660@qq.com> Co-authored-by: Ying Sheng Co-authored-by: zhangxin81 <115389973+zhangxin81@users.noreply.github.com> Co-authored-by: 杨睿 <595403043@qq.com> Co-authored-by: r.yang Co-authored-by: Rahul Kindi Co-authored-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com> Co-authored-by: Daya Khudia <37562707+dskhudia@users.noreply.github.com> Co-authored-by: Dean Wyatte <2512762+dwyatte@users.noreply.github.com> --- README.md | 4 + docs/gpt_guide.md | 2 +- docs/gptneox_guide.md | 56 ++- .../cpp/multi_gpu_gpt/gpt_example_utils.cc | 2 +- .../gpt/utils/huggingface_bloom_convert.py | 192 ++++++++-- examples/pytorch/gptneox/gptneox_example.py | 226 ++++++++++++ examples/pytorch/gptneox/utils/gptneox.py | 317 ++++++++++++++++ .../utils/huggingface_gptneox_convert.py | 251 +++++++++++++ .../kernels/beam_search_topk_kernels.cu | 4 +- .../kernels/custom_ar_kernels.cu | 4 +- .../decoder_masked_multihead_attention.cu | 3 + .../decoder_masked_multihead_attention_112.cu | 101 +++++ .../kernels/decoding_kernels.cu | 48 +-- src/fastertransformer/kernels/gpt_kernels.cu | 48 ++- src/fastertransformer/kernels/gpt_kernels.h | 1 + .../kernels/stop_criteria_kernels.cu | 2 +- .../kernels/unfused_attention_kernels.cu | 19 +- .../layers/TensorParallelGeluFfnLayer.cc | 2 + .../layers/TensorParallelReluFfnLayer.cc | 2 + .../layers/TensorParallelSiluFfnLayer.cc | 2 + .../DecoderCrossAttentionLayer.cu | 6 +- .../DecoderSelfAttentionLayer.cc | 4 +- ...ensorParallelDecoderCrossAttentionLayer.cc | 4 +- ...TensorParallelDecoderSelfAttentionLayer.cc | 4 +- ...ensorParallelDisentangledAttentionLayer.cc | 4 +- .../TensorParallelGptContextAttentionLayer.cc | 4 +- 
.../TensorParallelUnfusedAttentionLayer.cc | 4 +- src/fastertransformer/models/bert/Bert.cc | 4 +- .../gptneox/GptNeoXDecoderLayerWeight.h | 2 +- .../models/gptneox/GptNeoXWeight.cc | 10 + .../models/gptneox/GptNeoXWeight.h | 7 + .../models/multi_gpu_gpt/ParallelGpt.cc | 109 +++--- .../models/multi_gpu_gpt/ParallelGpt.h | 7 +- .../models/t5/T5DecodingWeight.cc | 8 +- src/fastertransformer/th_op/CMakeLists.txt | 3 + src/fastertransformer/th_op/common/GptOps.cc | 1 + .../th_op/gptneox/CMakeLists.txt | 17 + .../th_op/gptneox/GptNeoXOp.cc | 164 +++++++++ .../th_op/gptneox/GptNeoXOp.h | 346 ++++++++++++++++++ .../utils/cublasMMWrapper.cc | 4 +- .../utils/gemm_test/gpt_gemm_func.cc | 16 +- .../utils/gemm_test/gpt_gemm_func.h | 16 +- ...used_self_multihead_attention_unit_test.py | 6 +- tests/unittests/test_gpt_kernels.cu | 1 + 44 files changed, 1867 insertions(+), 170 deletions(-) create mode 100644 examples/pytorch/gptneox/gptneox_example.py create mode 100755 examples/pytorch/gptneox/utils/gptneox.py create mode 100644 examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py create mode 100644 src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_112.cu create mode 100755 src/fastertransformer/th_op/gptneox/CMakeLists.txt create mode 100755 src/fastertransformer/th_op/gptneox/GptNeoXOp.cc create mode 100755 src/fastertransformer/th_op/gptneox/GptNeoXOp.h diff --git a/README.md b/README.md index a82098cd4..a00e0d631 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. 
We provide | Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | | ViT | PyTorch | Yes | Yes | - | - | - | - | | ViT | TensorRT | Yes | Yes | - | - | - | - | +| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | | GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | | BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | | WeNet | C++ | Yes | - | - | - | - | - | @@ -212,6 +213,9 @@ In the experiments of decoding, we updated the following parameters: ### Changelog +May 2023 +- Fix bugs of generation early stopping + January 2023 - Support GPT MoE - Support FP8 for Bert and GPT (**Experimental**) diff --git a/docs/gpt_guide.md b/docs/gpt_guide.md index 4be09d411..4a10c1d46 100644 --- a/docs/gpt_guide.md +++ b/docs/gpt_guide.md @@ -458,7 +458,7 @@ python ../examples/pytorch/gpt/utils/huggingface_gpt_convert.py -i gpt2-xl/ -o . 2. Run GPT on PyTorch - Basically, `gpt_example.py` includes the example how to declare a model, load a ckeckpoint, and forward context inputs and get generated outputs in Pytorch. + Basically, `gpt_example.py` includes the example how to declare a model, load a checkpoint, and forward context inputs and get generated outputs in Pytorch. For generating outputs based on context inputs, create a text file including the context inputs (line by line) and set `--sample_file_input` to the text file path. (By default, the script will generate outputs without context inputs.) Set `--sample_file_output` to write the outputs to a file. Use `--data_type fp16/bf16` to run in FP16 or BF16. diff --git a/docs/gptneox_guide.md b/docs/gptneox_guide.md index 4a443fae8..dcedbe8ed 100644 --- a/docs/gptneox_guide.md +++ b/docs/gptneox_guide.md @@ -36,6 +36,7 @@ We provide the environment variables to tune for specific usage. * Checkpoint converter * EleutherAI + * HuggingFace * Data type * FP32 * FP16 @@ -46,7 +47,7 @@ We provide the environment variables to tune for specific usage. 
* Bad words list * Beam search and sampling are both supported -## Setup +## Setup from EleutherAI checkpoint ### Requirements @@ -72,6 +73,22 @@ You may download the tokenizer config [here](https://mystic.the-eye.eu/public/AI To tokenize/detokenize files, use the script found in `examples/pytorch/gptneox/utils/hftokenizer.py`. You may need to pass the path to the tokenizer config with the `--tokenizer` flag. +## Setup from HuggingFace checkpoint + +> Please check out https://huggingface.co/docs to learn more about the usage of the huggingface models and tokenizers. + +First download a huggingface checkpoint: + +```bash +git lfs clone https://huggingface.co/ORG_NAME/MODEL_NAME +``` + +Then use the script provided by FasterTransformer to convert the checkpoint to raw weights, understood by FT. You can change `-i_g` to specify the tensor parallelism size. + +```bash +python ../examples/pytorch/gptneox/utils/huggingface_gptneox_convert.py -i ../path/to/your/model -o ../../path/to/fastertransformer/model -i_g 1 -m_n gptneox +``` + ### Run GPT-NeoX * Generate the `gemm_config.in` file.\ @@ -89,14 +106,39 @@ To tokenize/detokenize files, use the script found in `examples/pytorch/gptneox/ mpirun -n 2 --allow-run-as-root ./bin/gptneox_example ``` -E.g. by setting the `data_type` of `gptneox_config.ini` to `fp16`, users can run gpt model under fp16. + E.g. by setting the `data_type` of `gptneox_config.ini` to `fp16`, users can run gpt model under fp16. + + You can then decode the `out` file with the tokenizer: -You can then decode the `out` file with the tokenizer: + ```bash + wget https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json + ../examples/pytorch/gptneox/utils/hftokenizer.py out --tokenizer 20B_tokenizer.json + ``` + +* Run GPT on PyTorch + + Basically, `gptneox_example.py` includes an example of how to declare a model, load a checkpoint, forward context inputs, and get generated outputs in PyTorch. 
+ + For generating outputs based on context inputs, create a text file including the context inputs (line by line) and set `--sample_input_file` to the text file path. (By default, the script will generate outputs without context inputs.) + + Run with `-h` to see more settings. + + Run GPT with TP and PP on a single node. Note that the number of processes must be equal to `tensor_para_size * pipeline_para_size`. + + ```bash + # No parallelism (tensor_para_size=1, pipeline_para_size=1) + python ../examples/pytorch/gptneox/gptneox_example.py + + # TP (tensor_para_size=2, pipeline_para_size=1) + mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gptneox/gptneox_example.py --tensor_para_size=2 --pipeline_para_size=1 --ckpt_path="/path/to/your/model/2-gpu" + + # PP (tensor_para_size=1, pipeline_para_size=2) + mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gptneox/gptneox_example.py --tensor_para_size=1 --pipeline_para_size=2 --ckpt_path="/path/to/your/model/1-gpu" + + # TP and PP (tensor_para_size=2, pipeline_para_size=2) + mpirun -n 4 --allow-run-as-root python ../examples/pytorch/gptneox/gptneox_example.py --tensor_para_size=2 --pipeline_para_size=2 --ckpt_path="/path/to/your/model/2-gpu" + ``` - ```bash - wget https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json - ../examples/pytorch/gptneox/utils/hftokenizer.py out --tokenizer 20B_tokenizer.json - ``` no prompts, 1 --> from loaded prompts, 2 --> from request prompts template -__global__ void embeddingLookupPosEncoding(T* from_tensor, - const T* embedding_table, - const T* position_encoding, - const int* all_ids, - const int* padding_count, - const int* input_lengths, - const int local_token_num, - const int hidden_units, - const int step, - const int max_input_length, - const int token_num, - const int ite, - const T scale) +__global__ void embeddingLookupPosEncoding(T* from_tensor, + const T* embedding_table, + const T* position_encoding, + const int* all_ids, + 
const int* padding_count, + const int* input_lengths, + const int local_token_num, + const int64_t hidden_units, + const int step, + const int max_input_length, + const int token_num, + const int ite, + const T scale) { // 1. lookup from embedding table // 2. multiply scale @@ -120,7 +120,7 @@ __global__ void embeddingLookupPosEncoding(T* from_tensor, const bool use_padding_count = padding_count != nullptr; const bool use_input_len = input_lengths != nullptr; - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units; + for (int64_t index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units; index += blockDim.x * gridDim.x) { const int row_index = index / hidden_units; const int col_index = index % hidden_units; @@ -148,7 +148,7 @@ __global__ void embeddingLookup(T* from_tensor, const int* all_ids, pPromptTuningParam prompt_param, const int local_token_num, - const int hidden_units, + const int64_t hidden_units, const int step, const int token_num, const int ite, @@ -159,7 +159,7 @@ __global__ void embeddingLookup(T* from_tensor, // 2. 
multiply scale const int id_offset = step * token_num + ite * local_token_num; - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units; + for (int64_t index = blockIdx.x * blockDim.x + threadIdx.x; index < local_token_num * hidden_units; index += blockDim.x * gridDim.x) { const int word_index = index / hidden_units; @@ -313,15 +313,15 @@ INSTANTIATE_LOOKUP_POS_ENCODING_PAD_COUNT(__nv_bfloat16); #undef INSTANTIATE_LOOKUP_POS_ENCODING_PAD_COUNT template -__global__ void paddingEmbedding(T* padded_embedding_kernel, - T* padded_embedding_bias, - const T* embedding_kernel, - const T* embedding_bias, - const int hidden_unit, - const int vocab_size, - const int vocab_size_padded) +__global__ void paddingEmbedding(T* padded_embedding_kernel, + T* padded_embedding_bias, + const T* embedding_kernel, + const T* embedding_bias, + const int64_t hidden_unit, + const int64_t vocab_size, + const int64_t vocab_size_padded) { - for (int id = threadIdx.x + blockIdx.x * blockDim.x; id < hidden_unit * vocab_size_padded; + for (int64_t id = threadIdx.x + blockIdx.x * blockDim.x; id < hidden_unit * vocab_size_padded; id += blockDim.x * gridDim.x) { int row_id = id / vocab_size_padded; int col_id = id % vocab_size_padded; diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu index abb3b5db4..7dc9af620 100644 --- a/src/fastertransformer/kernels/gpt_kernels.cu +++ b/src/fastertransformer/kernels/gpt_kernels.cu @@ -39,7 +39,7 @@ __global__ void start_id_embedding_position_lookups_kernel(T* const int length, const int max_length, const int batch_size, - const int hidden_units) + const int64_t hidden_units) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; index += blockDim.x * gridDim.x) { @@ -250,20 +250,20 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo const int beam_id = tmp_index % param.beam_width; tmp_index = 
(tmp_index - beam_id) / param.beam_width; const int batch_id = tmp_index % param.batch_size; + const int64_t hidden_units = param.hidden_units; T embedding = (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ? - (T)param - .prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * param.hidden_units - + seq_id * param.hidden_units + hidden_id] : - param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length + (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + + seq_id * hidden_units + hidden_id] : + param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length + beam_id * param.max_input_length + (seq_id - param.prefix_soft_prompt_lengths[batch_id])] - * param.hidden_units + * hidden_units + hidden_id]; T pos_embed = param.pos_table == nullptr ? (T)0.0f : - param.pos_table[(param.start_step + seq_id - 1) * param.hidden_units + hidden_id]; + param.pos_table[(param.start_step + seq_id - 1) * hidden_units + hidden_id]; param.from_tensor[index] = embedding + pos_embed; if (seq_id == 0 && hidden_id == 0) { @@ -640,6 +640,7 @@ __global__ void generate_dups_indices(int* batch_to_compact, int* compact_size, const int* shared_contexts, const size_t batch_size, + const size_t beam_width, const size_t input_seq_len) { const int padded_batchsize = blockDim.x * ((batch_size + blockDim.x - 1) / blockDim.x); @@ -649,20 +650,23 @@ __global__ void generate_dups_indices(int* batch_to_compact, __shared__ int scan_offset; int scan = 0; - for (int batch = threadIdx.x; batch < padded_batchsize; batch += blockDim.x) { - bool masked = (batch >= batch_size); - bool first_iter = batch < blockDim.x; + for (int seq_idx = threadIdx.x; seq_idx < padded_batchsize; seq_idx += blockDim.x) { + bool masked = (seq_idx >= batch_size); + bool first_iter = seq_idx < blockDim.x; - int is_first_occur = masked ? 
0 : shared_contexts[batch] == batch; + int is_first_occur = masked ? 0 : shared_contexts[seq_idx] == seq_idx; BlockScan(temp_storage).ExclusiveSum(is_first_occur, scan); if (!masked && is_first_occur) { int compact_idx = scan + (first_iter ? 0 : scan_offset); // Context rep. writes initial index - batch_to_compact[batch] = compact_idx; - compact_to_batch[compact_idx] = batch; + batch_to_compact[seq_idx * beam_width] = compact_idx; + // input ids are tiled in context part + compact_to_batch[compact_idx] = seq_idx * beam_width; } + __syncthreads(); + if (threadIdx.x == blockDim.x - 1) { scan_offset = scan + is_first_occur + (first_iter ? 0 : scan_offset); } @@ -671,8 +675,15 @@ __global__ void generate_dups_indices(int* batch_to_compact, if (!masked && !is_first_occur) { // Fill the rest of batch_to_compact based on what rep. wrote - const int src_idx = batch_to_compact[shared_contexts[batch]]; - batch_to_compact[batch] = src_idx; + const int src_idx = batch_to_compact[shared_contexts[seq_idx] * beam_width]; + batch_to_compact[seq_idx * beam_width] = src_idx; + } + + if (!masked) { + // set same compact idx for beams + for (int beam_id = 1; beam_id < beam_width; ++beam_id) { + batch_to_compact[seq_idx * beam_width + beam_id] = batch_to_compact[seq_idx * beam_width]; + } } } @@ -696,14 +707,17 @@ void invokeFindContextDups(int* shared_contexts, int* compact_size, const int* input_ids, const size_t batch_size, + const size_t beam_width, const size_t input_seq_len, cudaStream_t stream) { dim3 block{512}; dim3 grid{((int)batch_size + block.x - 1) / block.x}; + // set shared_context[i] = i init_shared_contexts<<>>(shared_contexts, batch_size); grid = dim3{(unsigned int)(batch_size * (batch_size - 1)) / 2}; + // set shared_contexts[i] = j, where j = min{k, such that input_ids[k] == input_ids[i]} if (input_seq_len <= 128) { block = 128; find_context_dups<128><<>>(shared_contexts, input_ids, batch_size, input_seq_len); @@ -713,8 +727,10 @@ void invokeFindContextDups(int* 
shared_contexts, find_context_dups<256><<>>(shared_contexts, input_ids, batch_size, input_seq_len); } + // set batch_to_compact[i] = j, where j is the position of input_ids[i] in the compact_batch + // set compact_to_batch[i] = j, where j is such that compact_to_batch[i] = input_ids[j] generate_dups_indices<<<1, DUPS_INDICES_BLOCK_SIZE, 0, stream>>>( - batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, input_seq_len); + batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, beam_width, input_seq_len); } template diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h index 617f9bc05..d78224e0a 100644 --- a/src/fastertransformer/kernels/gpt_kernels.h +++ b/src/fastertransformer/kernels/gpt_kernels.h @@ -127,6 +127,7 @@ void invokeFindContextDups(int* shared_contexts, int* compact_size, const int* input_ids, const size_t batch_size, + const size_t beam_width, const size_t input_seq_len, cudaStream_t stream = 0); diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu index 5d6611153..a8d4b98fa 100644 --- a/src/fastertransformer/kernels/stop_criteria_kernels.cu +++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu @@ -150,7 +150,7 @@ void invokeLengthCriterion(bool* finished, length_criterion<<>>( finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step); - while (((volatile size_t*)h_pinned_finished_sum_)[0] == -1) {}; + while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {}; sync_check_cuda_error(); *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 90e8b8029..d0fb0a197 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ 
b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -268,23 +268,23 @@ __global__ void softmax_kernel(T* attn_score, // attn_mask, [batch_size, q_length, k_length] // linear_bias_slopes, [num_heads] - const int bi = blockIdx.y; // Batch index. - const int hi = blockIdx.z; // Head index. + const int64_t bi = blockIdx.y; // Batch index. + const int64_t hi = blockIdx.z; // Head index. __shared__ float s_mean, s_max; const float linear_bias_slope = linear_bias_slopes != nullptr ? (float)linear_bias_slopes[hi] : 0.0f; // Loop along with Q dimension. - for (int qi = blockIdx.x; qi < q_length; qi += gridDim.x) { + for (int64_t qi = blockIdx.x; qi < q_length; qi += gridDim.x) { float data[ITEMS_PER_THREAD]; - int qk_offset; + int64_t qk_offset; float local_max = -1e20f; // Loop along with K dimension. - for (int i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { - int ki = blockDim.x * i + threadIdx.x; // Index of K dimension. + for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { + int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. 
qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; float qk_val = static_cast(qk[qk_offset]); @@ -297,7 +297,7 @@ __global__ void softmax_kernel(T* attn_score, qk_bias += static_cast(linear_bias_slope * (ki - qi)); } - int mask_offset = (bi * q_length + qi) * k_length + ki; + int64_t mask_offset = (bi * q_length + qi) * k_length + ki; float mask_val = static_cast(ldg(&attn_mask[mask_offset])); qk_bias += (1.0f - mask_val) * -10000.0f; @@ -312,7 +312,7 @@ __global__ void softmax_kernel(T* attn_score, __syncthreads(); float local_sum = 0; - for (int i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { + for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { data[i] = __expf(data[i] - s_max); local_sum += data[i]; } @@ -324,7 +324,7 @@ __global__ void softmax_kernel(T* attn_score, } __syncthreads(); - for (int i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { + for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + blockDim.x * i + threadIdx.x; attn_score[qk_offset] = (T)(data[i] * s_mean); } @@ -602,6 +602,7 @@ __global__ void softmax_kernel_h2_v2(T* attn_score, #define LAUNCH_MAKSED_SOFTMAX_(T_, ITEMS_PER_THREAD) \ block.x /= ITEMS_PER_THREAD; \ + block.x = (block.x + 31) / 32 * 32; \ assert(block.x <= 1024); \ if (is_half2) { \ if (grid.x % 4 == 0) { \ diff --git a/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc index 1dda95b6d..fb78d5b3d 100644 --- a/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelGeluFfnLayer.cc @@ -45,6 +45,7 @@ void TensorParallelGeluFfnLayer::forward(TensorMap* output_tensors, if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&swap_tensors, token_num * hidden_units); + output_tensors->at("ffn_output").data = 
swap_tensors[0].data; } GeluFfnLayer::forward(output_tensors, input_tensors, ffn_weights); @@ -57,6 +58,7 @@ void TensorParallelGeluFfnLayer::forward(TensorMap* output_tensors, } else { custom_all_reduce_comm_->customAllReduce(token_num * hidden_units, GeluFfnLayer::stream_); + output_tensors->at("ffn_output").data = swap_tensors[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc index 29ac2846e..e8646c7d1 100644 --- a/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelReluFfnLayer.cc @@ -45,6 +45,7 @@ void TensorParallelReluFfnLayer::forward(TensorMap* output_tensors, if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&swap_tensors, token_num * hidden_units); + output_tensors->at("ffn_output").data = swap_tensors[0].data; } ReluFfnLayer::forward(output_tensors, input_tensors, ffn_weights); @@ -57,6 +58,7 @@ void TensorParallelReluFfnLayer::forward(TensorMap* output_tensors, } else { custom_all_reduce_comm_->customAllReduce(token_num * hidden_units, ReluFfnLayer::stream_); + output_tensors->at("ffn_output").data = swap_tensors[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc index 25a2da86b..bfc781cc4 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc @@ -44,6 +44,7 @@ void TensorParallelSiluFfnLayer::forward(TensorMap* output_tensors, if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&swap_tensors, token_num * hidden_units); + output_tensors->at("ffn_output").data = swap_tensors[0].data; } 
SiluFfnLayer::forward(output_tensors, input_tensors, ffn_weights); @@ -55,6 +56,7 @@ void TensorParallelSiluFfnLayer::forward(TensorMap* output_tensors, } else { custom_all_reduce_comm_->customAllReduce(token_num * hidden_units, SiluFfnLayer::stream_); + output_tensors->at("ffn_output").data = swap_tensors[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu b/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu index 7d022c4f4..55c4d9071 100644 --- a/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu +++ b/src/fastertransformer/layers/attention_layers/DecoderCrossAttentionLayer.cu @@ -796,8 +796,8 @@ DecoderCrossAttentionLayer::DecoderCrossAttentionLayer(size_t max_b q_scaling_(q_scaling) { FT_CHECK(size_per_head_ == 32 || size_per_head_ == 48 || size_per_head_ == 64 || size_per_head_ == 80 - || size_per_head_ == 96 || size_per_head_ == 128 || size_per_head_ == 144 || size_per_head_ == 160 - || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256); + || size_per_head_ == 96 || size_per_head_ == 112 || size_per_head_ == 128 || size_per_head_ == 144 + || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256); } template @@ -1030,4 +1030,4 @@ template class DecoderCrossAttentionLayer; template class DecoderCrossAttentionLayer<__nv_bfloat16>; #endif -} // namespace fastertransformer \ No newline at end of file +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc index 7ff426128..44fed478b 100644 --- a/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc @@ -278,8 +278,8 @@ DecoderSelfAttentionLayer::DecoderSelfAttentionLayer(size_t max_bat 
int8_mode_(int8_mode) { FT_CHECK(size_per_head_ == 32 || size_per_head_ == 48 || size_per_head_ == 64 || size_per_head_ == 80 - || size_per_head_ == 96 || size_per_head_ == 128 || size_per_head_ == 144 || size_per_head_ == 160 - || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256); + || size_per_head_ == 96 || size_per_head_ == 112 || size_per_head_ == 128 || size_per_head_ == 144 + || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256); if (int8_mode_ == 1) { FT_CHECK_WITH_INFO(!(std::is_same::value), "Weight only quant not supported for fp32."); weight_only_int8_fc_runner_ = std::make_shared>(); diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc index 8f14ff14a..0672b7150 100644 --- a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderCrossAttentionLayer.cc @@ -104,11 +104,12 @@ void TensorParallelDecoderCrossAttentionLayer::forward(TensorMap* // value_cache [batch, head_num, max_seq_len, size_per_head] const size_t size = output_tensors->at("hidden_features").size(); + std::vector reduce_tensor{output_tensors->at("hidden_features")}; bool use_custom_all_reduce_kernel = false; if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { - std::vector reduce_tensor{output_tensors->at("hidden_features")}; use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&reduce_tensor, size); + output_tensors->at("hidden_features").data = reduce_tensor[0].data; } DecoderCrossAttentionLayer::forward(output_tensors, input_tensors, attention_weights); @@ -121,6 +122,7 @@ void TensorParallelDecoderCrossAttentionLayer::forward(TensorMap* } else { custom_all_reduce_comm_->customAllReduce(size, DecoderCrossAttentionLayer::stream_); + 
output_tensors->at("hidden_features").data = reduce_tensor[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc index 4eb9159cf..fbb726e2d 100644 --- a/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.cc @@ -200,11 +200,12 @@ void TensorParallelDecoderSelfAttentionLayer::forward(TensorMap* // value_cache [batch, head_num, max_seq_len, size_per_head] const size_t size = output_tensors->at("hidden_features").size(); + std::vector reduce_tensor{output_tensors->at("hidden_features")}; bool use_custom_all_reduce_kernel = false; if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr && do_all_reduce_) { - std::vector reduce_tensor{output_tensors->at("hidden_features")}; use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&reduce_tensor, size); + output_tensors->at("hidden_features").data = reduce_tensor[0].data; } DecoderSelfAttentionLayer::forward(output_tensors, input_tensors, attention_weights); @@ -217,6 +218,7 @@ void TensorParallelDecoderSelfAttentionLayer::forward(TensorMap* } else { custom_all_reduce_comm_->customAllReduce(size, DecoderSelfAttentionLayer::stream_); + output_tensors->at("hidden_features").data = reduce_tensor[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc index dd66344a7..72840a960 100644 --- a/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/TensorParallelDisentangledAttentionLayer.cc @@ -35,11 +35,12 @@ void 
TensorParallelDisentangledAttentionLayer::forward(TensorMap* // For more information, please refer to DisentangledAttentionLayer const size_t size = output_tensors->at("hidden_features").size(); + std::vector hidden_features_reduce = {output_tensors->at("hidden_features")}; bool use_custom_all_reduce_kernel = false; if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { - std::vector hidden_features_reduce = {output_tensors->at("hidden_features")}; use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&hidden_features_reduce, size); + output_tensors->at("hidden_features").data = hidden_features_reduce[0].data; } DisentangledAttentionLayer::forward(output_tensors, input_tensors, attention_weights); @@ -52,6 +53,7 @@ void TensorParallelDisentangledAttentionLayer::forward(TensorMap* } else { custom_all_reduce_comm_->customAllReduce(size, DisentangledAttentionLayer::stream_); + output_tensors->at("hidden_features").data = hidden_features_reduce[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc index 55cb5efd7..bb4140ea3 100644 --- a/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.cc @@ -35,11 +35,12 @@ void TensorParallelGptContextAttentionLayer::forward(TensorMap* // value_cache [batch, local_head_num, max_seq_len, size_per_head] const size_t size = output_tensors->at("hidden_features").size(); + std::vector reduce_tensor{output_tensors->at("hidden_features")}; bool use_custom_all_reduce_kernel = false; if (do_all_reduce_ && enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { - std::vector reduce_tensor{output_tensors->at("hidden_features")}; use_custom_all_reduce_kernel = 
custom_all_reduce_comm_->swapInternalBuffer(&reduce_tensor, size); + output_tensors->at("hidden_features").data = reduce_tensor[0].data; } GptContextAttentionLayer::forward(output_tensors, input_tensors, attention_weights); @@ -52,6 +53,7 @@ void TensorParallelGptContextAttentionLayer::forward(TensorMap* } else { custom_all_reduce_comm_->customAllReduce(size, GptContextAttentionLayer::stream_); + output_tensors->at("hidden_features").data = reduce_tensor[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc index 0b7f2dc05..91cc6f416 100644 --- a/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/TensorParallelUnfusedAttentionLayer.cc @@ -37,11 +37,12 @@ void TensorParallelUnfusedAttentionLayer::forward(TensorMap* o // For more information, please refer to UnfusedAttentionLayer const size_t size = output_tensors->at("hidden_features").size(); + std::vector hidden_features_reduce = {output_tensors->at("hidden_features")}; bool use_custom_all_reduce_kernel = false; if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { - std::vector hidden_features_reduce = {output_tensors->at("hidden_features")}; use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&hidden_features_reduce, size); + output_tensors->at("hidden_features").data = hidden_features_reduce[0].data; } UnfusedAttentionLayer::forward(output_tensors, input_tensors, attention_weights); @@ -53,6 +54,7 @@ void TensorParallelUnfusedAttentionLayer::forward(TensorMap* o } else { custom_all_reduce_comm_->customAllReduce(size, UnfusedAttentionLayer::stream_); + output_tensors->at("hidden_features").data = hidden_features_reduce[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/bert/Bert.cc 
b/src/fastertransformer/models/bert/Bert.cc index 9b51c89cc..320fa29cf 100644 --- a/src/fastertransformer/models/bert/Bert.cc +++ b/src/fastertransformer/models/bert/Bert.cc @@ -510,10 +510,11 @@ void Bert::forward(TensorMap* output_tensors, TensorMap* input_tensors, const Tensor{MEMORY_GPU, data_type, std::vector{h_token_num, hidden_units_}, attn_out_buf_}}}); bool use_custom_all_reduce_kernel = false; + std::vector hidden_features{attn_output_tensors.at("hidden_features")}; if (enable_custom_all_reduce_ && custom_all_reduce_comm_ != nullptr) { - std::vector hidden_features{attn_output_tensors.at("hidden_features")}; use_custom_all_reduce_kernel = custom_all_reduce_comm_->swapInternalBuffer(&hidden_features, h_token_num * hidden_units_); + attn_output_tensors.at("hidden_features").data = hidden_features[0].data; } if (attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::FUSED_PADDED_MHA) { @@ -535,6 +536,7 @@ void Bert::forward(TensorMap* output_tensors, TensorMap* input_tensors, const } else { custom_all_reduce_comm_->customAllReduce(h_token_num * hidden_units_, stream_); + attn_output_tensors.at("hidden_features").data = hidden_features[0].data; } sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h index 3662256c1..2850da466 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h @@ -28,7 +28,7 @@ namespace fastertransformer { template struct GptNeoXDecoderLayerWeight { public: - GptNeoXDecoderLayerWeight() = delete; + GptNeoXDecoderLayerWeight() = default; GptNeoXDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc index 0f052a3a3..26995f255 100644 --- 
a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc @@ -278,6 +278,16 @@ void GptNeoXWeight::loadModel(std::string dir_path) } } +template +void GptNeoXWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight()); + } +} + template bool GptNeoXWeight::isValidLayerParallelId(int l) { diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.h b/src/fastertransformer/models/gptneox/GptNeoXWeight.h index 2a6b1764e..3e868854e 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXWeight.h +++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.h @@ -47,6 +47,8 @@ struct GptNeoXWeight { void loadModel(std::string dir_path); + void resizeLayer(const int num_layer); + std::vector*> decoder_layer_weights; const T* pre_decoder_embedding_table = nullptr; // GPT-J does not use embedding table, but we leave the ptr such that @@ -65,6 +67,11 @@ struct GptNeoXWeight { LayerNormWeight post_decoder_layernorm; DenseWeight post_decoder_embedding; + inline void setMaxSeqLen(size_t max_seq_len) + { + max_seq_len_ = max_seq_len; + } + private: void setWeightPtr(); void mallocWeights(); diff --git a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc index 2b9e4f3c4..93b80ae6e 100644 --- a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc +++ b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc @@ -101,7 +101,11 @@ void ParallelGpt::allocateBuffer(size_t batch_size, bool is_return_context_cum_log_probs) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size * beam_width; + const size_t batchxbeam = batch_size * beam_width; + const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); + FT_CHECK(batch_size % 
local_batch_size == 0); + const size_t num_microbatches = batch_size / local_batch_size; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * memory_len * hidden_units_ / tensor_para_.world_size_; @@ -111,8 +115,8 @@ void ParallelGpt::allocateBuffer(size_t batch_size, padded_embedding_kernel_ptr_ = padded_embedding_kernel_; } - input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_input_len * max_input_len, false)); + tiled_input_attention_mask_ = (T*)(allocator_->reMalloc( + tiled_input_attention_mask_, sizeof(T) * batchxbeam * max_input_len * max_input_len, false)); decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); decoder_normed_input_buf_ = (T*)(allocator_->reMalloc(decoder_normed_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); @@ -125,7 +129,6 @@ void ParallelGpt::allocateBuffer(size_t batch_size, (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); @@ -154,7 +157,8 @@ void ParallelGpt::allocateBuffer(size_t batch_size, output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_session_len, true)); parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_session_len, true)); seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); - masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, 
sizeof(bool) * batchxbeam * memory_len, true)); + tiled_masked_tokens_ = + (bool*)(allocator_->reMalloc(tiled_masked_tokens_, sizeof(bool) * batchxbeam * memory_len, true)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); @@ -184,12 +188,13 @@ void ParallelGpt::allocateBuffer(size_t batch_size, lp_logprob_buf_ = (float*)allocator_->reMalloc(lp_logprob_buf_, sizeof(float) * batchxbeam * max_input_len); } if (shared_contexts_ratio_ > 0.0f) { - shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, 3 * batch_size * sizeof(int), false); - batch_to_compact_idx_ = shared_contexts_idx_ + batch_size; - compact_idx_ = shared_contexts_idx_ + 2 * batch_size; + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); } - generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + microbatch_should_stop_ = + (bool*)allocator_->reMalloc(microbatch_should_stop_, sizeof(bool) * num_microbatches, true, true); tiled_total_padding_count_ = (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); @@ -205,7 +210,7 @@ void ParallelGpt::freeBuffer() allocator_->free((void**)(&padded_embedding_kernel_)); } - allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&tiled_input_attention_mask_)); allocator_->free((void**)(&decoder_input_buf_)); allocator_->free((void**)(&decoder_output_buf_)); allocator_->free((void**)(&normed_decoder_output_buf_)); @@ -213,7 +218,6 @@ void ParallelGpt::freeBuffer() allocator_->free((void**)(&nccl_logits_buf_)); 
allocator_->free((void**)(&cum_log_probs_)); allocator_->free((void**)(&finished_buf_)); - delete[] h_finished_buf_; allocator_->free((void**)(&sequence_lengths_)); allocator_->free((void**)(&key_cache_)); @@ -230,7 +234,7 @@ void ParallelGpt::freeBuffer() allocator_->free((void**)(&transposed_output_ids_buf_)); allocator_->free((void**)(&output_ids_buf_)); allocator_->free((void**)(&parent_ids_buf_)); - allocator_->free((void**)(&masked_tokens_)); + allocator_->free((void**)(&tiled_masked_tokens_)); allocator_->free((void**)(&seq_limit_len_)); @@ -254,7 +258,7 @@ void ParallelGpt::freeBuffer() allocator_->free((void**)(&lp_nccl_logits_buf_)); allocator_->free((void**)(&lp_logprob_buf_)); - allocator_->free((void**)(&generation_should_stop_), true); + allocator_->free((void**)(&microbatch_should_stop_), true); if (shared_contexts_ratio_ > 0.0f) { allocator_->free((void**)(&shared_contexts_idx_)); @@ -416,6 +420,8 @@ void ParallelGpt::computeContextCumLogProbs(float* cum_l const size_t batchxbeam = batch_size * beam_width; const size_t n_hidden_states = batchxbeam * max_input_length; + const cudaDataType_t cublas_type = getCudaDataType(); + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { // normed decoder output [batch_size * beam_width, max_input_length, hidden_units_] invokeGeneralLayerNorm(lp_normed_decoder_output_buf_, @@ -439,10 +445,10 @@ void ParallelGpt::computeContextCumLogProbs(float* cum_l hidden_units_, // k &alpha, padded_embedding_kernel_ptr_, - sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F, + cublas_type, hidden_units_, // k lp_normed_decoder_output_buf_, - sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F, + cublas_type, hidden_units_, // k &beta, lp_logits_buf_, @@ -464,10 +470,10 @@ void ParallelGpt::computeContextCumLogProbs(float* cum_l hidden_units_, // k &alpha, padded_embedding_kernel_ptr_ + tensor_para_.rank_ * local_vocab_size * hidden_units_, - sizeof(T) == 2 ? 
CUDA_R_16F : CUDA_R_32F, + cublas_type, hidden_units_, // k lp_normed_decoder_output_buf_, - sizeof(T) == 2 ? CUDA_R_16F : CUDA_R_32F, + cublas_type, hidden_units_, // k &beta, lp_nccl_logits_buf_ + tensor_para_.rank_ * n_hidden_states * local_vocab_size, @@ -809,8 +815,9 @@ void ParallelGpt::forward(std::unordered_map* outp num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, local_head_num_, memory_len, size_per_head_}; { - PUSH_RANGE("dynamic decode setup"); TensorMap input_map(*input_tensors); + + PUSH_RANGE("dynamic decode setup"); dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); @@ -858,7 +865,7 @@ void ParallelGpt::forward(std::unordered_map* outp PUSH_RANGE("initialize output and parent ids"); cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * session_len, stream_); cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * session_len, stream_); - cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * memory_len, stream_); + cudaMemsetAsync(tiled_masked_tokens_, false, sizeof(bool) * batch_size * beam_width * memory_len, stream_); cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); if (beam_width > 1) { cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * memory_len, stream_); @@ -889,6 +896,7 @@ void ParallelGpt::forward(std::unordered_map* outp compact_size_, input_tensors->at("input_ids").getPtr(), batch_size, + beam_width, max_input_length, stream_); cudaD2Hcpy(&compact_size, compact_size_, 1); @@ -1028,7 +1036,7 @@ void ParallelGpt::forward(std::unordered_map* outp POP_RANGE; } PUSH_RANGE("build decoder attention mask"); - invokeBuildDecoderAttentionMask(input_attention_mask_, + 
invokeBuildDecoderAttentionMask(tiled_input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size * beam_width, @@ -1049,15 +1057,16 @@ void ParallelGpt::forward(std::unordered_map* outp Tensor(MEMORY_GPU, data_type, {batch_size * beam_width, 1, (size_t)max_input_length, (size_t)max_input_length}, - input_attention_mask_)}, + tiled_input_attention_mask_)}, {"input_lengths", Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_)}}); if (use_shared_contexts) { decoder_input_tensors.insert("compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)); - decoder_input_tensors.insert("batch_to_compact_idx", - Tensor(MEMORY_GPU, TYPE_INT32, {batch_size}, batch_to_compact_idx_)); + decoder_input_tensors.insert( + "batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)); } if (gpt_variant_params_.use_attention_linear_bias) { decoder_input_tensors.insert("linear_bias_slopes", @@ -1169,7 +1178,7 @@ void ParallelGpt::forward(std::unordered_map* outp } PUSH_RANGE("mask padding tokens"); - invokeMaskPaddingTokens(masked_tokens_, + invokeMaskPaddingTokens(tiled_masked_tokens_, input_tensors->at("input_lengths").getPtr(), memory_len, max_input_length, @@ -1184,6 +1193,10 @@ void ParallelGpt::forward(std::unordered_map* outp const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); + const size_t iteration_num = batch_size / local_batch_size; + for (int microbatch = 0; microbatch < iteration_num; ++microbatch) { + microbatch_should_stop_[microbatch] = false; + } for (step_ = step_start; step_ < (int)gen_len; step_++) { // Loop body produces Nth token by embedding && encoding token (N-1) @@ -1192,11 +1205,14 @@ void ParallelGpt::forward(std::unordered_map* outp const int src_indir_idx = (step_ - step_start) % 2; const int tgt_indir_idx = 1 - src_indir_idx; - const size_t iteration_num = 
batch_size / local_batch_size; - *generation_should_stop_ = !fill_caches_only; + bool generation_should_stop = !fill_caches_only; PUSH_RANGE(fmtstr("token_%d", step_ - step_start)); for (uint ite = 0; ite < iteration_num; ++ite) { + // skip the finished microbatch in previous steps + if (microbatch_should_stop_[ite]) { + continue; + } const int id_offset = ite * local_batch_size * beam_width; const int hidden_units_offset = id_offset * hidden_units_; const int vocab_size_units_offset = id_offset * vocab_size_padded_; @@ -1214,10 +1230,9 @@ void ParallelGpt::forward(std::unordered_map* outp pipeline_para_, stream_); - // receive updated generation_should_stop_ from last rank - if (ite == 0) { - ftNcclRecv(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); - } + // receive updated microbatch_should_stop_ from last rank + ftNcclRecv(microbatch_should_stop_ + ite, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + generation_should_stop &= microbatch_should_stop_[ite]; // receive updated cache_indirections from last rank if (beam_width > 1) { @@ -1241,10 +1256,10 @@ void ParallelGpt::forward(std::unordered_map* outp // throw errors when detected ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); sync_check_cuda_error(); - - if (ite == 0 && *generation_should_stop_) { - break; - } + } + // skip the microbatch for last step, which is updated by last rank + if (microbatch_should_stop_[ite]) { + continue; } if ((max_input_length <= 1) || (step_ > step_start) || continue_gen) { @@ -1302,7 +1317,7 @@ void ParallelGpt::forward(std::unordered_map* outp Tensor(MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width, memory_len}, - masked_tokens_ + id_offset * memory_len)}}); + tiled_masked_tokens_ + id_offset * memory_len)}}); if (beam_width > 1) { decoder_input_tensors.insert({"cache_indirection", Tensor(MEMORY_GPU, @@ -1403,7 +1418,7 @@ void ParallelGpt::forward(std::unordered_map* outp CUDA_R_32F, 
cublasGemmAlgo_t(-1)); POP_RANGE; - PUSH_RANGE("logits all reduce sum"); + PUSH_RANGE("logits all gather"); ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, nccl_logits_buf_ + vocab_size_units_offset, local_batch_size * beam_width * local_vocab_size, @@ -1484,9 +1499,14 @@ void ParallelGpt::forward(std::unordered_map* outp PUSH_RANGE("result sampling and stop check"); dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); - *generation_should_stop_ &= subbatch_should_stop; + generation_should_stop &= subbatch_should_stop; + microbatch_should_stop_[ite] = subbatch_should_stop; POP_RANGE; } + else { + // for other ranks, they cannot update generation_should_stop by DynamicDecode, set to false directly; + generation_should_stop &= microbatch_should_stop_[ite]; + } PUSH_RANGE("result communication"); // send results to other rank @@ -1504,10 +1524,8 @@ void ParallelGpt::forward(std::unordered_map* outp ftNcclSend( sequence_lengths_ + id_offset, local_batch_size * beam_width, i, pipeline_para_, stream_); - // send updated generation_should_stop_ - if (ite == 0) { - ftNcclSend(generation_should_stop_, 1, i, pipeline_para_, stream_); - } + // send updated microbatch_should_stop_ + ftNcclSend(microbatch_should_stop_ + ite, 1, i, pipeline_para_, stream_); // send updated cache_indirections if (beam_width > 1) { @@ -1547,13 +1565,20 @@ void ParallelGpt::forward(std::unordered_map* outp if (step_ == initial_step + max_input_length) { /* We have just finished processing input: update the padding count: * total_padding_count += (max_input_length - input_lengths) */ + PUSH_RANGE("Update padding count"); invokeUpdatePaddingCount(tiled_total_padding_count_, input_tensors->at("input_lengths").getPtr(), max_input_length, batch_size, beam_width, stream_); + POP_RANGE; + } + + if (generation_should_stop) { + break; } + POP_RANGE; } PUSH_RANGE("communicate tensors"); @@ -1605,6 +1630,7 @@ void 
ParallelGpt::setOutputTensors(std::unordered_map* const size_t max_context_len, const size_t max_input_without_prompt_length) { + PUSH_RANGE("Resolve output tensors"); if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { return; } @@ -1706,6 +1732,7 @@ void ParallelGpt::setOutputTensors(std::unordered_map* cudaD2Dcpy( output_tensors->at("is_finished").getPtr(), finished_buf_, output_tensors->at("is_finished").size()); } + POP_RANGE; } template diff --git a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h index 39b6bab5e..ea24de2d3 100644 --- a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h +++ b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h @@ -116,7 +116,7 @@ class ParallelGpt: public BaseLayer { T* padded_embedding_kernel_; const T* padded_embedding_kernel_ptr_; - T* input_attention_mask_; + T* tiled_input_attention_mask_; T* decoder_input_buf_; T* decoder_normed_input_buf_ = nullptr; @@ -126,10 +126,9 @@ class ParallelGpt: public BaseLayer { float* nccl_logits_buf_; float* cum_log_probs_; bool* finished_buf_; - bool* h_finished_buf_; int* sequence_lengths_ = nullptr; uint32_t* seq_limit_len_ = nullptr; - bool* generation_should_stop_ = nullptr; + bool* microbatch_should_stop_ = nullptr; int* shared_contexts_idx_ = nullptr; T* compact_decoder_features_ = nullptr; @@ -154,7 +153,7 @@ class ParallelGpt: public BaseLayer { int* transposed_output_ids_buf_; int* output_ids_buf_; int* parent_ids_buf_; - bool* masked_tokens_ = nullptr; + bool* tiled_masked_tokens_ = nullptr; T* context_decoder_input_buf_; T* context_decoder_normed_input_buf_; diff --git a/src/fastertransformer/models/t5/T5DecodingWeight.cc b/src/fastertransformer/models/t5/T5DecodingWeight.cc index 09a657fe0..e99f7145f 100644 --- a/src/fastertransformer/models/t5/T5DecodingWeight.cc +++ b/src/fastertransformer/models/t5/T5DecodingWeight.cc @@ -269,9 +269,11 @@ void T5DecodingWeight::loadModel(std::string 
dir_path) } if (t5_with_bias) { - loadWeightFromBin( - weights_ptr[4], {(size_t)weights_size[4]}, dir_path + "/decoder.final_layer_norm.bias.bin"); - loadWeightFromBin(weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/shared.bias.bin"); + loadWeightFromBin(weights_ptr[4], + {(size_t)weights_size[4]}, + dir_path + "/decoder.final_layer_norm.bias.bin", + model_file_type); + loadWeightFromBin(weights_ptr[5], {(size_t)weights_size[5]}, dir_path + "/shared.bias.bin", model_file_type); } for (int l = 0; l < num_layer_; l++) { diff --git a/src/fastertransformer/th_op/CMakeLists.txt b/src/fastertransformer/th_op/CMakeLists.txt index ddd6be058..b9f2b9151 100644 --- a/src/fastertransformer/th_op/CMakeLists.txt +++ b/src/fastertransformer/th_op/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(longformer) add_subdirectory(swin) add_subdirectory(vit) add_subdirectory(multi_gpu_gpt) +add_subdirectory(gptneox) add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(bert) @@ -43,6 +44,7 @@ add_library(th_transformer SHARED $ $ $ + $ $ $ $ @@ -59,6 +61,7 @@ target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_gather_tree th_longformer th_parallel_gpt + th_gptneox th_swintransformer th_t5 th_utils diff --git a/src/fastertransformer/th_op/common/GptOps.cc b/src/fastertransformer/th_op/common/GptOps.cc index ea3a86887..fbb018085 100644 --- a/src/fastertransformer/th_op/common/GptOps.cc +++ b/src/fastertransformer/th_op/common/GptOps.cc @@ -48,6 +48,7 @@ std::vector find_context_duplications(Tensor input_ids) get_ptr(compact_size_tensor), get_ptr(input_ids), batch_size, + 1, seq_len, stream); diff --git a/src/fastertransformer/th_op/gptneox/CMakeLists.txt b/src/fastertransformer/th_op/gptneox/CMakeLists.txt new file mode 100755 index 000000000..dcebaa80c --- /dev/null +++ b/src/fastertransformer/th_op/gptneox/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(th_gptneox STATIC GptNeoXOp.cc) +set_property(TARGET th_gptneox PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(th_gptneox PRIVATE "${TORCH_LIBRARIES}" GptNeoX th_utils nccl_utils) diff --git a/src/fastertransformer/th_op/gptneox/GptNeoXOp.cc b/src/fastertransformer/th_op/gptneox/GptNeoXOp.cc new file mode 100755 index 000000000..09e09c8e0 --- /dev/null +++ b/src/fastertransformer/th_op/gptneox/GptNeoXOp.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/th_op/gptneox/GptNeoXOp.h" + +namespace th = torch; +namespace ft = fastertransformer; +namespace torch_ext { + +GptNeoXOp::GptNeoXOp(const int64_t head_num, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights): + st_(weights[0].scalar_type()) +{ + for (auto t : weights) { + CHECK_INPUT(t, st_); + } + + switch (st_) { + case at::ScalarType::Float: + ftgpt = new FTGptNeoX((size_t)head_num, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)layer_num, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + start_id, + end_id, + tensor_para_size, + pipeline_para_size, + (size_t)max_seq_len, + use_gptj_residual, + weights); + break; + case at::ScalarType::Half: + ftgpt = new FTGptNeoX((size_t)head_num, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)layer_num, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + start_id, + end_id, + tensor_para_size, + pipeline_para_size, + (size_t)max_seq_len, + use_gptj_residual, + weights); + break; + default: + throw std::runtime_error("Wrong Tensor type."); + } +} + +GptNeoXOp::~GptNeoXOp() +{ + delete ftgpt; +} + +std::vector GptNeoXOp::forward(th::Tensor input_ids, + th::Tensor input_lengths, + const int64_t output_len, + th::optional beam_width_opt, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) +{ + CHECK_TH_CUDA(input_ids); + CHECK_CONTIGUOUS(input_ids); + TORCH_CHECK(input_ids.dtype() == torch::kInt32, "input_ids dtype should be int32"); + 
CHECK_TH_CUDA(input_lengths); + CHECK_CONTIGUOUS(input_lengths); + TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); + int64_t return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int64_t)return_cum_log_probs_opt.value() : 0; + if (return_cum_log_probs_opt.has_value()) { + TORCH_CHECK(return_cum_log_probs == 0 || return_cum_log_probs == 1, + "return_cum_log_probs should be" + " 0 (no return cum_log_probs), " + " 1 (the cumulative log probs of generated sequences)") + } + + const int beam_width = beam_width_opt.has_value() ? (int)beam_width_opt.value() : 1; + + const int batch_size = input_ids.size(0); + const int max_input_length = input_ids.size(1); + const int total_request_output_len = max_input_length + output_len; + th::Tensor output_ids = torch::empty({batch_size, beam_width, total_request_output_len}, + torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); + th::Tensor sequence_lengths = + torch::empty({batch_size, beam_width}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); + th::Tensor cum_log_probs = + torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); + + ftgpt->forward(input_ids, + input_lengths, + output_ids, + sequence_lengths, + cum_log_probs, + (const size_t)output_len, + (const size_t)beam_width, + top_k_opt, + top_p_opt, + beam_search_diversity_rate_opt, + temperature_opt, + len_penalty_opt, + repetition_penalty_opt, + random_seed_opt, + return_cum_log_probs_opt); + if (return_cum_log_probs > 0) { + return std::vector{output_ids, sequence_lengths, cum_log_probs}; + } + return std::vector{output_ids, sequence_lengths}; +} + +} // namespace torch_ext + +static auto fasterTransformerGptTHS = +#ifdef LEGACY_THS + torch::jit::class_("FasterTransformerGptNeoXOp") +#else + torch::jit::class_("FasterTransformer", "GptNeoXOp") +#endif + .def(torch::jit::init>()) + .def("forward", 
&torch_ext::GptNeoXOp::forward); diff --git a/src/fastertransformer/th_op/gptneox/GptNeoXOp.h b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h new file mode 100755 index 000000000..222fdd409 --- /dev/null +++ b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoX.h" +#include "src/fastertransformer/th_op/th_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace ft = fastertransformer; +namespace th = torch; +namespace torch_ext { + +using std::vector; + +class IFGptNeoX { +public: + virtual ~IFGptNeoX() {} + virtual void forward(th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& output_ids, + th::Tensor& sequence_lengths, + th::Tensor& cum_log_probs, + const size_t request_output_len, + const size_t beam_width, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) = 0; +}; + +template +class FTGptNeoX: public IFGptNeoX { +public: + FTGptNeoX(const size_t head_num, + const size_t size_per_head, + 
const size_t inter_size, + const size_t layer_num, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const int start_id, + const int end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const size_t max_seq_len, + const bool use_gptj_residual, + const vector weights): + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + layer_num_(layer_num), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + use_gptj_residual_(use_gptj_residual), + weights_(weights), + tensor_para_size_(tensor_para_size), + pipeline_para_size_(pipeline_para_size) + { + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); + cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); + cublas_wrapper_mutex_ = new std::mutex(); + + ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); + + gpt_weights_.resizeLayer(layer_num_); + for (int i = 0; i < (int)layer_num_; i++) { + gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + get_ptr(weights_[i + 0 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + get_ptr(weights_[i + 1 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + get_ptr(weights_[i + 2 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + get_ptr(weights_[i + 3 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + get_ptr(weights_[i + 4 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + get_ptr(weights_[i + 5 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + get_ptr(weights_[i + 6 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + get_ptr(weights_[i + 
7 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + get_ptr(weights_[i + 8 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + get_ptr(weights_[i + 9 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + get_ptr(weights_[i + 10 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + get_ptr(weights_[i + 11 * layer_num_]); + } + + gpt_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); + gpt_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); + gpt_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); + gpt_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); + + gpt_weights_.setMaxSeqLen(max_seq_len); + + ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + } + + ~FTGptNeoX() override + { + ft::ftNcclParamDestroy(tensor_para_); + ft::ftNcclParamDestroy(pipeline_para_); + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + void forward(th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& output_ids, + th::Tensor& sequence_lengths, + th::Tensor& cum_log_probs, + const size_t request_output_len, + const size_t beam_width, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) override + { + int return_cum_log_probs = return_cum_log_probs_opt.has_value() ? 
(int)return_cum_log_probs_opt.value() : 0; + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); + cublasSetStream(cublasHandle, stream); + ft::Allocator allocator = ft::Allocator(); + ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); + + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + const size_t request_batch_size = (size_t)input_ids.size(0); + const size_t max_input_length = (size_t)input_ids.size(1); + const int total_output_len = (int)(max_input_length + request_output_len); + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + ft::GptNeoX gpt = ft::GptNeoX(head_num_, + size_per_head_, + inter_size_, + layer_num_, + vocab_size_, + rotary_embedding_dim_, + start_id_, + end_id_, + end_id_ + 1, // p/prompt tuning virtual token start id + ft::PromptLearningType::no_prompt, + use_gptj_residual_, + 0.0f, // beam_search_diversity_rate, + 1, // top_k, + 0.0, // top_p, + 0, // random_seed, + 1.0f, // temperature, + 1.0f, // len_penalty, + 1.0f, // repetition_penalty, + tensor_para_, + pipeline_para_, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type, // attention_type + nullptr, // custom_all_reduce_comm + 0); // enable_custom_all_reduce + + std::vector output_seq_len(request_batch_size, total_output_len); + + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, max_input_length}, 
+ get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ + ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, + {"output_seq_len", + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}}; + if (beam_width > 1 && beam_search_diversity_rate_opt.has_value()) { + input_tensors.insert( + {"beam_search_diversity_rate", + convert_tensor(beam_search_diversity_rate_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (top_p_opt.has_value()) { + input_tensors.insert( + {"runtime_top_p", convert_tensor(top_p_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (top_k_opt.has_value()) { + input_tensors.insert( + {"runtime_top_k", convert_tensor(top_k_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (temperature_opt.has_value()) { + input_tensors.insert( + {"temperature", convert_tensor(temperature_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (len_penalty_opt.has_value()) { + input_tensors.insert( + {"len_penalty", convert_tensor(len_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (repetition_penalty_opt.has_value()) { + input_tensors.insert({"repetition_penalty", + convert_tensor(repetition_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (random_seed_opt.has_value()) { + input_tensors.insert( + {"random_seed", + convert_tensor(random_seed_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + get_ptr(output_ids)}}, + {"sequence_length", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, beam_width}, + get_ptr(sequence_lengths)}}}; + + if (return_cum_log_probs > 0) { + output_tensors.insert({"cum_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width}, + get_ptr(cum_log_probs)}}); + } 
+ + try { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights_); + } + catch (std::runtime_error& error) { + std::cout << error.what(); + exit(-1); + } + catch (...) { + std::cout << "Runtime error"; + exit(-1); + } + } + +private: + const size_t head_num_; + const size_t size_per_head_; + const size_t inter_size_; + const size_t layer_num_; + const size_t vocab_size_; + const size_t rotary_embedding_dim_; + const int start_id_; + const int end_id_; + const bool use_gptj_residual_; + + // const ft::gptVariantParams gpt_variant_params_; + + std::vector weights_; + cublasLtHandle_t cublasltHandle_; + std::mutex* cublas_wrapper_mutex_; + ft::cublasAlgoMap* cublas_algo_map_; + struct cudaDeviceProp prop_; + ft::GptNeoXWeight gpt_weights_; + + ft::NcclParam tensor_para_; + ft::NcclParam pipeline_para_; + + int64_t tensor_para_size_; + int64_t pipeline_para_size_; +}; + +class GptNeoXOp: public th::jit::CustomClassHolder { +public: + GptNeoXOp(const int64_t head_num, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights); + + ~GptNeoXOp(); + + vector forward(th::Tensor input_ids, + th::Tensor input_lengths, + const int64_t output_len, + th::optional beam_width_opt, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt); + +private: + const at::ScalarType st_; + IFGptNeoX* ftgpt; + std::vector weights; +}; + +} // namespace torch_ext diff --git a/src/fastertransformer/utils/cublasMMWrapper.cc b/src/fastertransformer/utils/cublasMMWrapper.cc index 
12e6c8f0a..baf460fdc 100644 --- a/src/fastertransformer/utils/cublasMMWrapper.cc +++ b/src/fastertransformer/utils/cublasMMWrapper.cc @@ -799,7 +799,7 @@ std::pair cublasMMWrapper::findBestAlgo(cublasLtHand cublasLtMatrixLayout_t Ddesc, cudaStream_t stream) { -#if (CUBLAS_VERSION) <= 11402 +#if (CUBLAS_VERSION) < 11601 FT_CHECK_WITH_INFO(false, "CUBLAS version too low."); return {false, cublasLtMatmulAlgo_t{}}; #else @@ -984,7 +984,7 @@ void cublasMMWrapper::_Int8Gemm(const int m, * - 0: int8 * int8 -> int32 -> int8 * - 1: int8 * int8 -> int32 -> int32 */ -#if (CUBLAS_VERSION) <= 11402 +#if (CUBLAS_VERSION) < 11601 FT_CHECK_WITH_INFO(false, "CUBLAS version too low."); #else diff --git a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc index 165206710..474a8c81a 100644 --- a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc +++ b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.cc @@ -751,14 +751,14 @@ template void generate_gpt_gemm_config<__nv_fp8_e4m3>(int batch_size, bool isAppend); #endif -size_t calGptGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, +size_t calGptGemmTestBufSizeInByte(size_t batch_size, + size_t beam_width, + size_t max_input_len, + size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t vocab_size, + size_t tensor_para_size, CublasDataType data_type) { size_t buf_size_in_byte = 0; diff --git a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h index 82eec3b1b..336ef95f6 100644 --- a/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h +++ b/src/fastertransformer/utils/gemm_test/gpt_gemm_func.h @@ -50,14 +50,14 @@ void generate_gpt_gemm_config(int batch_size, void* buffer_in, bool isAppend); -size_t calGptGemmTestBufSizeInByte(int batch_size, - int beam_width, - int 
max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, +size_t calGptGemmTestBufSizeInByte(size_t batch_size, + size_t beam_width, + size_t max_input_len, + size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t vocab_size, + size_t tensor_para_size, CublasDataType data_type); } // namespace fastertransformer diff --git a/tests/decoding/tf_fused_self_multihead_attention_unit_test.py b/tests/decoding/tf_fused_self_multihead_attention_unit_test.py index a4a7031e5..a09c0028d 100644 --- a/tests/decoding/tf_fused_self_multihead_attention_unit_test.py +++ b/tests/decoding/tf_fused_self_multihead_attention_unit_test.py @@ -56,12 +56,12 @@ def test_attn_head_fp16(self): self.run_attn(4, 128, head, 64, tf.float16) def test_attn_size_fp32(self): - for size in [32, 64, 80, 96, 128, 144, 160, 192, 224, 256]: + for size in [32, 64, 80, 96, 112, 128, 144, 160, 192, 224, 256]: tf.reset_default_graph() self.run_attn(4, 128, 12, size, tf.float32) def test_attn_size_fp16(self): - for size in [32, 64, 80, 96, 128, 144, 160, 192, 224, 256]: + for size in [32, 64, 80, 96, 112, 128, 144, 160, 192, 224, 256]: tf.reset_default_graph() self.run_attn(4, 128, 12, size, tf.float16) @@ -171,4 +171,4 @@ def run_attn(self, batch_size, seq_len, head_num, size_per_head, data_type): assert(v_cache_max_diff < threshold) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/unittests/test_gpt_kernels.cu b/tests/unittests/test_gpt_kernels.cu index cef959078..c41308b8c 100644 --- a/tests/unittests/test_gpt_kernels.cu +++ b/tests/unittests/test_gpt_kernels.cu @@ -85,6 +85,7 @@ int test_find_context_dups() d_compact_size, d_input_ids, batch_size, + 1,//beam_width vec_size); int compact_size; From da9ef99a3ea0c2e3a1f1fbc083aeb76452d6b32f Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:01:20 -0700 Subject: [PATCH 02/79] commit --- 
examples/cpp/llama/llama_config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index ef789d35d..f9af743e7 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -6,7 +6,7 @@ tensor_para_size=1 pipeline_para_size=1 model_name=llama_7b -model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/ +model_dir=/notebooks/llama2-7b-chat-tp8/ [request] beam_width=1 # beam width for beam search From 7f1e8bfeefd50ecb3889715eb3b67513871454ec Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:01:59 -0700 Subject: [PATCH 03/79] commit --- examples/cpp/llama/llama_config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index f9af743e7..616a7581b 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -2,7 +2,7 @@ data_type=fp16 enable_custom_all_reduce=0 -tensor_para_size=1 +tensor_para_size=8 pipeline_para_size=1 model_name=llama_7b From bb6fce458bb540ab6a7de634a1f81295d5b27f2b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:03:39 -0700 Subject: [PATCH 04/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 3df2a2203..ea6945660 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -316,7 +316,7 @@ int main(int argc, char* argv[]) std::string ini_name = argc >= 2 ? 
std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini"; // step 1: Create model - std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); + std::shared_ptr model = AbstractTransformerModel::createLlamaModel("/notebooks/llama2-7b-chat-tp8/config.ini"); int tensor_para_size = model->getTensorParaSize(); int pipeline_para_size = model->getPipelineParaSize(); FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), From 49c94e8659fc1bf693325088b353d325fe734cc6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:05:02 -0700 Subject: [PATCH 05/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index ea6945660..3df2a2203 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -316,7 +316,7 @@ int main(int argc, char* argv[]) std::string ini_name = argc >= 2 ? 
std::string(argv[1]) : "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini"; // step 1: Create model - std::shared_ptr model = AbstractTransformerModel::createLlamaModel("/notebooks/llama2-7b-chat-tp8/config.ini"); + std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); int tensor_para_size = model->getTensorParaSize(); int pipeline_para_size = model->getPipelineParaSize(); FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), From c510c26adebb7fe24505cbfd6b61b37f57deb161 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:06:44 -0700 Subject: [PATCH 06/79] commit --- examples/cpp/llama/llama_triton_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 3df2a2203..2b9d9d663 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -242,10 +242,10 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std max_input_len, end_id, 1, - "../examples/cpp/llama/start_ids.csv"); + "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"); std::vector v_bad_words; - ft::read_word_list("../examples/cpp/llama/bad_words.csv", v_bad_words); + ft::read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/bad_words.csv", v_bad_words); RequestParam param; param.beam_width = reader.GetInteger("request", "beam_width"); From 8933482e0b82fa33e20b7b322ca49008408be526 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:08:12 -0700 Subject: [PATCH 07/79] commit --- examples/cpp/llama/llama_triton_example.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 2b9d9d663..a51fcbf32 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -378,6 +378,7 
@@ int main(int argc, char* argv[]) const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data; + printf("Here\n"); // step 6: check results if (node_id == 0) { From 98ab7df047d87b320795056ca55cd7b6796b61b8 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:08:55 -0700 Subject: [PATCH 08/79] commit --- examples/cpp/llama/llama_triton_example.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index a51fcbf32..68fd10f58 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -377,8 +377,9 @@ int main(int argc, char* argv[]) const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; + printf("Here\n"); const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data; - printf("Here\n"); + // step 6: check results if (node_id == 0) { From 626287a3f541b2dce493fba8c3cc20f981b12b24 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:10:00 -0700 Subject: [PATCH 09/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 68fd10f58..9f08500f7 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -358,7 +358,7 @@ int main(int argc, char* argv[]) // step 5: Forward std::vector>> output_tensors_lists( (size_t)gpu_count); - for (int i = 0; i < 2; i++) { + for (int i = 0; 
i < 1; i++) { threads.clear(); for (int device_id = 0; device_id < gpu_count; device_id++) { threads.push_back(std::thread(threadForward, From 787c1c5c2de5f23b44989a44fb626be3e9c88ea2 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:10:54 -0700 Subject: [PATCH 10/79] commit --- .vscode/settings.json | 3 ++- examples/cpp/llama/llama_triton_example.cc | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 82000232b..79166a171 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -68,6 +68,7 @@ "future": "cpp", "cfenv": "cpp", "typeindex": "cpp", - "variant": "cpp" + "variant": "cpp", + "ios": "cpp" } } diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 9f08500f7..4b8f90563 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -379,7 +379,7 @@ int main(int argc, char* argv[]) const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; printf("Here\n"); const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data; - +printf("Here\n"); // step 6: check results if (node_id == 0) { From 06e941b9083b7c6b665b0e370566b17a41df6393 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:12:19 -0700 Subject: [PATCH 11/79] commit --- examples/cpp/llama/llama_triton_example.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 4b8f90563..5ecb737b7 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -377,9 +377,6 @@ int main(int argc, char* argv[]) const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; const int seq_len = 
output_tensors_lists[0].get()->at("output_ids").shape[2]; - printf("Here\n"); - const int* d_input_lengths = (const int*)output_tensors_lists[0].get()->at("input_lengths").data; -printf("Here\n"); // step 6: check results if (node_id == 0) { From ce8272aab00aab7abc619389e2a8dcaa41dd5672 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:12:47 -0700 Subject: [PATCH 12/79] commit --- examples/cpp/llama/llama_triton_example.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 5ecb737b7..0b57c3ac4 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -390,7 +390,6 @@ int main(int argc, char* argv[]) int* hBuf = new int[outCount]; int* iBuf = new int[batch_size]; ft::cudaD2Hcpy(hBuf, d_output_ids, outCount); - ft::cudaD2Hcpy(iBuf, d_input_lengths, batch_size); { From c6f25436999542406cbfceaeabf1e0ce93fe8be6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:25:20 -0700 Subject: [PATCH 13/79] commit --- examples/cpp/llama/start_ids.csv | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 612c85964..891837f48 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1 +1,10 @@ -1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 
+1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,75
2,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From e1f2a76107b760f921a82f4bf75c7146513a3092 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:26:10 -0700 Subject: [PATCH 14/79] commit --- examples/cpp/llama/llama_config.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 616a7581b..ee3ebbaff 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -17,8 +17,8 @@ repetition_penalty=1.0 ; Use for sampling presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. 
len_penalty=0.0 beam_search_diversity_rate=0.0 -request_batch_size=8 # determine by the request -request_output_len=32 # determine by the request +request_batch_size=20 # determine by the request +request_output_len=512 # determine by the request [llama_7b] head_num = 64 From 441c343a6467f51ada3c38d8a607bca2f4c92fcc Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 20:27:37 -0700 Subject: [PATCH 15/79] commit --- examples/cpp/llama/llama_triton_example.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 0b57c3ac4..68570914b 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -395,9 +395,9 @@ int main(int argc, char* argv[]) { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; - for (int i=0; i Date: Mon, 30 Oct 2023 20:28:39 -0700 Subject: [PATCH 16/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 68570914b..4366c8534 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -404,7 +404,7 @@ int main(int argc, char* argv[]) zeroCount++; outFile << hBuf[i] << " "; if ((i + 1) % (seq_len) == 0) - outFile << std::endl; + printf("\n\n"); // if (i < 10) printf("%d ", hBuf[i]); From 38919b65a99d1e30d84f19f5de42a74d924bbd4a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:01:26 -0700 Subject: [PATCH 17/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 4366c8534..65e00187a 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ 
b/examples/cpp/llama/llama_triton_example.cc @@ -407,7 +407,7 @@ int main(int argc, char* argv[]) printf("\n\n"); // if (i < 10) - printf("%d ", hBuf[i]); + printf("%d,", hBuf[i]); // if ((i + 1) % (seq_len) == 0 && i < 10) // std::cout << std::endl; } From 4c0dbba5fdbda5ef0138b03ceba608d86506ca92 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:05:14 -0700 Subject: [PATCH 18/79] commit --- examples/cpp/llama/llama_triton_example.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 65e00187a..e71a77695 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -377,6 +377,7 @@ int main(int argc, char* argv[]) const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; + printf("%d %d %d\n", batch_size, beam_width, seq_len); // step 6: check results if (node_id == 0) { From b6945af49b16afaab165cb3d60ed7d3286692369 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:08:15 -0700 Subject: [PATCH 19/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index e71a77695..559f7afdd 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -353,7 +353,7 @@ int main(int argc, char* argv[]) std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = prepareRequest(ini_name, node_id, gpu_count, &pointer_record); - printf("[INFO] request is created \n"); + printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward std::vector>> 
output_tensors_lists( From 728f8900488a2547f85d3c5c764fe892a91fe128 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:09:22 -0700 Subject: [PATCH 20/79] commit --- examples/cpp/llama/llama_config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index ee3ebbaff..4257c9c5a 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -17,7 +17,7 @@ repetition_penalty=1.0 ; Use for sampling presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. len_penalty=0.0 beam_search_diversity_rate=0.0 -request_batch_size=20 # determine by the request +request_batch_size=10 # determine by the request request_output_len=512 # determine by the request [llama_7b] From 8aeb13ae76c12cdb2cf49372bb1da929fff0f6e1 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:17:44 -0700 Subject: [PATCH 21/79] commit --- examples/cpp/llama/llama_config.ini | 2 +- examples/cpp/llama/start_ids.csv | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 4257c9c5a..19f6d016d 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -17,7 +17,7 @@ repetition_penalty=1.0 ; Use for sampling presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. 
len_penalty=0.0 beam_search_diversity_rate=0.0 -request_batch_size=10 # determine by the request +request_batch_size=1 # determine by the request request_output_len=512 # determine by the request [llama_7b] diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 891837f48..651307e8e 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,10 +1 @@ -1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From ffd2f9693ab553a6802e9f5dc1b91edf5aec81db Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:19:11 -0700 Subject: [PATCH 22/79] commit --- examples/cpp/llama/llama_config.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 19f6d016d..0fcd5d68a 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -9,11 +9,11 @@ model_name=llama_7b model_dir=/notebooks/llama2-7b-chat-tp8/ [request] -beam_width=1 # beam width for beam search -top_k=1 ; k value for top k sampling +beam_width=0 # beam width for beam search +top_k=0 ; k value for top k sampling top_p=0.0 ; p value for top p sampling -temperature=1.0 ; Use for sampling -repetition_penalty=1.0 ; Use for sampling +temperature=0 ; Use for sampling +repetition_penalty=0 ; Use for sampling presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. 
len_penalty=0.0 beam_search_diversity_rate=0.0 From 18eb7b4e354cfaf32a3cb7abb7a3900fd6454bb1 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:20:00 -0700 Subject: [PATCH 23/79] commit --- examples/cpp/llama/llama_triton_example.cc | 130 ++++++++++----------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 559f7afdd..fe95c1396 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -143,71 +143,71 @@ broadCastRequest(const std::vector& v_start_ids, {"end_id", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}})); - int* beam_width_ptr = new int(param.beam_width); - pointer_record->push_back(beam_width_ptr); - request_list[device_id]->insert( - {"beam_width", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, beam_width_ptr}}); - if (param.beam_width > 1) { - float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate); - pointer_record->push_back(beam_search_diversity_rate_ptr); - request_list[device_id]->insert( - {"beam_search_diversity_rate", - triton::Tensor{ - triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, beam_search_diversity_rate_ptr}}); - } - else { - if (param.runtime_top_p != 0.0f) { - float* runtime_top_p_ptr = new float(param.runtime_top_p); - pointer_record->push_back(runtime_top_p_ptr); - request_list[device_id]->insert( - {"runtime_top_p", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, runtime_top_p_ptr}}); - } - if (param.runtime_top_k != 0) { - uint* runtime_top_k_ptr = new uint(param.runtime_top_k); - pointer_record->push_back(runtime_top_k_ptr); - request_list[device_id]->insert( - {"runtime_top_k", - triton::Tensor{ - triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector{1}, runtime_top_k_ptr}}); - } - } - float* temperature_ptr = new 
float(param.temperature); - pointer_record->push_back(temperature_ptr); - request_list[device_id]->insert( - {"temperature", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, temperature_ptr}}); - float* len_penalty_ptr = new float(param.len_penalty); - pointer_record->push_back(len_penalty_ptr); - request_list[device_id]->insert( - {"len_penalty", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, len_penalty_ptr}}); - if (param.repetition_penalty != 1.0f) { - float* repetition_penalty_ptr = new float(param.repetition_penalty); - pointer_record->push_back(repetition_penalty_ptr); - request_list[device_id]->insert( - {"repetition_penalty", - triton::Tensor{ - triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, repetition_penalty_ptr}}); - } - if (param.presence_penalty != 0.0f) { - float* presence_penalty_ptr = new float(param.presence_penalty); - pointer_record->push_back(presence_penalty_ptr); - request_list[device_id]->insert( - {"presence_penalty", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, presence_penalty_ptr}}); - } - int* min_length_ptr = new int(param.min_length); - pointer_record->push_back(min_length_ptr); - request_list[device_id]->insert( - {"min_length", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, min_length_ptr}}); - unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed); - pointer_record->push_back(random_seed_ptr); - request_list[device_id]->insert( - {"random_seed", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector{1}, random_seed_ptr}}); + // int* beam_width_ptr = new int(param.beam_width); + // pointer_record->push_back(beam_width_ptr); + // request_list[device_id]->insert( + // {"beam_width", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, beam_width_ptr}}); + // if (param.beam_width > 1) { + // float* beam_search_diversity_rate_ptr = new 
float(param.beam_search_diversity_rate); + // pointer_record->push_back(beam_search_diversity_rate_ptr); + // request_list[device_id]->insert( + // {"beam_search_diversity_rate", + // triton::Tensor{ + // triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, beam_search_diversity_rate_ptr}}); + // } + // else { + // if (param.runtime_top_p != 0.0f) { + // float* runtime_top_p_ptr = new float(param.runtime_top_p); + // pointer_record->push_back(runtime_top_p_ptr); + // request_list[device_id]->insert( + // {"runtime_top_p", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, runtime_top_p_ptr}}); + // } + // if (param.runtime_top_k != 0) { + // uint* runtime_top_k_ptr = new uint(param.runtime_top_k); + // pointer_record->push_back(runtime_top_k_ptr); + // request_list[device_id]->insert( + // {"runtime_top_k", + // triton::Tensor{ + // triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector{1}, runtime_top_k_ptr}}); + // } + // } + // float* temperature_ptr = new float(param.temperature); + // pointer_record->push_back(temperature_ptr); + // request_list[device_id]->insert( + // {"temperature", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, temperature_ptr}}); + // float* len_penalty_ptr = new float(param.len_penalty); + // pointer_record->push_back(len_penalty_ptr); + // request_list[device_id]->insert( + // {"len_penalty", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, len_penalty_ptr}}); + // if (param.repetition_penalty != 1.0f) { + // float* repetition_penalty_ptr = new float(param.repetition_penalty); + // pointer_record->push_back(repetition_penalty_ptr); + // request_list[device_id]->insert( + // {"repetition_penalty", + // triton::Tensor{ + // triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, repetition_penalty_ptr}}); + // } + // if (param.presence_penalty != 0.0f) { + // float* presence_penalty_ptr = new float(param.presence_penalty); + // 
pointer_record->push_back(presence_penalty_ptr); + // request_list[device_id]->insert( + // {"presence_penalty", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, presence_penalty_ptr}}); + // } + // int* min_length_ptr = new int(param.min_length); + // pointer_record->push_back(min_length_ptr); + // request_list[device_id]->insert( + // {"min_length", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, min_length_ptr}}); + // unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed); + // pointer_record->push_back(random_seed_ptr); + // request_list[device_id]->insert( + // {"random_seed", + // triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector{1}, random_seed_ptr}}); pointer_record->push_back(d_input_ids); pointer_record->push_back(d_input_lengths); From 28cba076e48c185683c17e959b6b121da048baa8 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:21:45 -0700 Subject: [PATCH 24/79] commit --- examples/cpp/llama/llama_triton_example.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index fe95c1396..61f32c1be 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -401,14 +401,15 @@ int main(int argc, char* argv[]) // } printf("\n"); for (size_t i = 0; i < outCount; i++) { - if (hBuf[i] == int(0)) - zeroCount++; - outFile << hBuf[i] << " "; - if ((i + 1) % (seq_len) == 0) - printf("\n\n"); + // if (hBuf[i] == int(0)) + // zeroCount++; + // outFile << hBuf[i] << " "; + // if (i < 10) printf("%d,", hBuf[i]); + if ((i + 1) % (seq_len) == 0) + printf("\n\n"); // if ((i + 1) % (seq_len) == 0 && i < 10) // std::cout << std::endl; } From 165704c9687c367b98d874fc63a5a3b4667d6501 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:24:04 -0700 Subject: [PATCH 25/79] commit --- 
examples/cpp/llama/start_ids.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 651307e8e..218eb01e5 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1 +1 @@ -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2 From d792097eb468807e85e43ddf1f43689dc1897347 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:28:27 -0700 Subject: [PATCH 26/79] commit --- examples/cpp/llama/start_ids.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 218eb01e5..7cbfab468 100644 --- a/examples/cpp/llama/start_ids.csv 
+++ b/examples/cpp/llama/start_ids.csv @@ -1 +1 @@ -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 From f2534bed301d295c990a2e5427338051c6da8888 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:30:51 -0700 Subject: [PATCH 27/79] commit --- examples/cpp/llama/start_ids.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 7cbfab468..218eb01e5 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1 +1 @@ -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2 From 67aa2849e789566a59d2f3cc50132cb6923eb6dd Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 
Oct 2023 21:31:40 -0700 Subject: [PATCH 28/79] commit --- examples/cpp/llama/start_ids.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 218eb01e5..651307e8e 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1 +1 @@ -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962,2 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 From b41505565abb4ec26e50b931e496f3860dfd620c Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:33:36 -0700 Subject: [PATCH 29/79] commit --- examples/cpp/llama/start_ids.csv | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 
651307e8e..891837f48 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1 +1,10 @@ +1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From 2ecae5e9baccbe891b5ab6c855df85afb89f7208 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:37:56 -0700 Subject: [PATCH 30/79] commit --- examples/cpp/llama/llama_config.ini | 2 +- examples/cpp/llama/start_ids.csv | 12 ++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 0fcd5d68a..8d34ce813 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -17,7 +17,7 @@ repetition_penalty=0 ; Use for sampling presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. 
len_penalty=0.0 beam_search_diversity_rate=0.0 -request_batch_size=1 # determine by the request +request_batch_size=2 # determine by the request request_output_len=512 # determine by the request [llama_7b] diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 891837f48..12a321d93 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,10 +1,2 @@ -1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 +1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,29991,518,29914,25580,29962 
+1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,29889,518,29914,25580,29962 From 455c6b8155625addbca4e543504218aae9506e27 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:40:14 -0700 Subject: [PATCH 31/79] commit --- examples/cpp/llama/llama_triton_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 61f32c1be..2026ce444 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ 
b/examples/cpp/llama/llama_triton_example.cc @@ -219,7 +219,7 @@ broadCastRequest(const std::vector& v_start_ids, } std::vector>> -prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record) +prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, string file_name) { INIReader reader = INIReader(ini_name); if (reader.ParseError() < 0) { @@ -242,7 +242,7 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std max_input_len, end_id, 1, - "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"); + file_name); std::vector v_bad_words; ft::read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/bad_words.csv", v_bad_words); From 9ecbfc9b19d79ebfd0b49837056226c14f923f78 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:40:28 -0700 Subject: [PATCH 32/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 2026ce444..3efd71fb7 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -352,7 +352,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward From cb8b38df12dc1d8a8289c7cbb1b3e6323d7975a8 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:40:57 -0700 Subject: [PATCH 33/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 3efd71fb7..ac36be20a 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -352,7 +352,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv")); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward From 662d60570664c566e1a4c986e970794a8b554fb4 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:41:29 -0700 Subject: [PATCH 34/79] commit --- .vscode/settings.json | 16 +++++++++++++++- examples/cpp/llama/llama_triton_example.cc | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 79166a171..1ef97bcca 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -69,6 +69,20 @@ "cfenv": "cpp", "typeindex": "cpp", "variant": "cpp", - "ios": "cpp" + "ios": "cpp", + "__bit_reference": "cpp", + "__config": "cpp", + "__debug": "cpp", + "__errc": "cpp", + "__hash_table": "cpp", + "__locale": "cpp", + "__mutex_base": "cpp", + "__node_handle": "cpp", + "__split_buffer": "cpp", + "__threading_support": "cpp", + "__tree": "cpp", + "__verbose_abort": "cpp", + "charconv": "cpp", + "locale": "cpp" } } diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index ac36be20a..3f22cddf2 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -219,7 +219,7 @@ broadCastRequest(const std::vector& 
v_start_ids, } std::vector>> -prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, string file_name) +prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, std::string file_name) { INIReader reader = INIReader(ini_name); if (reader.ParseError() < 0) { From ec6516133859b454a00ef6060576a499fb6646b6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:42:25 -0700 Subject: [PATCH 35/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 3f22cddf2..f476f999f 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -349,6 +349,7 @@ int main(int argc, char* argv[]) t.join(); } +{ // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = @@ -418,6 +419,7 @@ int main(int argc, char* argv[]) delete[] hBuf; } } +} // test time struct timeval start, end; From d2c0e8f4524d883610d035aefe92f4a24ad0529b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:42:58 -0700 Subject: [PATCH 36/79] commit --- examples/cpp/llama/llama_triton_example.cc | 40 +++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index f476f999f..d3ca14669 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -421,26 +421,26 @@ int main(int argc, char* argv[]) } } - // test time - struct timeval start, end; - ft::mpi::barrier(); - cudaDeviceSynchronize(); - gettimeofday(&start, NULL); - - const int ite = 1; - for (int i = 0; i < ite; i++) { - threads.clear(); - for (int device_id = 0; device_id < gpu_count; 
device_id++) { - threads.push_back(std::thread(threadForward, - &model_instances[device_id], - request_list[device_id], - &output_tensors_lists[device_id], - device_id)); - } - for (auto& t : threads) { - t.join(); - } - } + // // test time + // struct timeval start, end; + // ft::mpi::barrier(); + // cudaDeviceSynchronize(); + // gettimeofday(&start, NULL); + + // const int ite = 1; + // for (int i = 0; i < ite; i++) { + // threads.clear(); + // for (int device_id = 0; device_id < gpu_count; device_id++) { + // threads.push_back(std::thread(threadForward, + // &model_instances[device_id], + // request_list[device_id], + // &output_tensors_lists[device_id], + // device_id)); + // } + // for (auto& t : threads) { + // t.join(); + // } + // } cudaDeviceSynchronize(); ft::mpi::barrier(); From 7afad53934c6246d2cf9cb766a469a73154a5db4 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:43:18 -0700 Subject: [PATCH 37/79] commit --- examples/cpp/llama/llama_triton_example.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index d3ca14669..a669b16af 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -445,14 +445,14 @@ int main(int argc, char* argv[]) cudaDeviceSynchronize(); ft::mpi::barrier(); - gettimeofday(&end, NULL); - - printf("[INFO] batch_size %d beam_width %d seq_len %d" - " FT-CPP-GPT-Triton-time %.2f ms\n", - batch_size, - beam_width, - seq_len, - ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + // gettimeofday(&end, NULL); + + // printf("[INFO] batch_size %d beam_width %d seq_len %d" + // " FT-CPP-GPT-Triton-time %.2f ms\n", + // batch_size, + // beam_width, + // seq_len, + // ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); ft::mpi::finalize(); return 0; From 
2804459b01da85525dfc0b6d666b665a3562e361 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:44:00 -0700 Subject: [PATCH 38/79] commit --- examples/cpp/llama/llama_triton_example.cc | 72 ++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index a669b16af..b9bd81b13 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -394,6 +394,78 @@ int main(int argc, char* argv[]) ft::cudaD2Hcpy(hBuf, d_output_ids, outCount); + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + // for (int i=0; i>> output_tensors_lists( + (size_t)gpu_count); + for (int i = 0; i < 1; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id)); + } + for (auto& t : threads) { + t.join(); + } + } + printf("[INFO] forward is completed. 
\n"); + + const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data; + const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; + const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; + const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; + printf("%d %d %d\n", batch_size, beam_width, seq_len); + // step 6: check results + if (node_id == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = batch_size * beam_width * seq_len; + int* hBuf = new int[outCount]; + int* iBuf = new int[batch_size]; + ft::cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; From 3e431f9514b20d7c43586b4b4eac029cb93e99fa Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:45:00 -0700 Subject: [PATCH 39/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b9bd81b13..1f9558c7f 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv")); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv")); printf("[INFO] request is created : %d\n", 
request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 12a321d93..891837f48 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,2 +1,10 @@ -1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,29991,518,29914,25580,29962 -1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,
1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,29889,518,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From 748aa0c1b0cab039d5162941a1d2a1ab86cb2247 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:46:06 -0700 Subject: [PATCH 40/79] commit --- examples/cpp/llama/start_ids2.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 examples/cpp/llama/start_ids2.csv diff --git a/examples/cpp/llama/start_ids2.csv b/examples/cpp/llama/start_ids2.csv new file mode 100644 index 000000000..12a321d93 --- /dev/null +++ b/examples/cpp/llama/start_ids2.csv @@ -0,0 +1,2 @@ +1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,29991,518,29914,25580,29962 
+1,518,25580,29962,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,6160,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,29889,518,29914,25580,29962 From 1f4210e5ea9262e5a6f7fe537162bc17138e8ead Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:46:37 -0700 Subject: [PATCH 41/79] commit --- examples/cpp/llama/llama_triton_example.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 1f9558c7f..330fa10d5 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ 
b/examples/cpp/llama/llama_triton_example.cc @@ -219,7 +219,7 @@ broadCastRequest(const std::vector& v_start_ids, } std::vector>> -prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, std::string file_name) +prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, std::string file_name, size_t request_batch_size) { INIReader reader = INIReader(ini_name); if (reader.ParseError() < 0) { @@ -227,7 +227,7 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std ft::FT_CHECK(false); } - const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + // const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); const int start_id = reader.GetInteger("llama_7b", "start_id"); const int end_id = reader.GetInteger("llama_7b", "end_id"); @@ -353,7 +353,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv")); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv")); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"), 2); 
printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward From f1dac5cd0ad8b7c028d8af8cbf980edeb9212f00 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:50:02 -0700 Subject: [PATCH 42/79] commit --- examples/cpp/llama/llama_triton_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 330fa10d5..6fd22b8b7 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -353,7 +353,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"), 2); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids2.csv"), 2); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward From 40d7e8b196f2daf5bc31fc3545c104ac27bcc474 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:52:35 -0700 Subject: [PATCH 43/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 9 
--------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 6fd22b8b7..b1ac4335a 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 1); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 891837f48..e1389ab16 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,10 +1 @@ 1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From dd09060645637c0707cb49756cceb18d4311c702 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:54:09 -0700 Subject: [PATCH 44/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b1ac4335a..18063d828 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 1); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index e1389ab16..7563a7588 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1 +1,2 @@ 
1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 From 
f8f43a4bca89d0b70f8fa53a486e7f7a8ed1aeee Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:55:16 -0700 Subject: [PATCH 45/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 18063d828..30bd0f108 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 5); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 7563a7588..d271af9fa 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,2 +1,5 @@ 
1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 From 7bfb5186523be90680ae03495018c291aad39dd9 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:56:03 -0700 Subject: [PATCH 46/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 30bd0f108..6fd22b8b7 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 5); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index d271af9fa..891837f48 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -3,3 +3,8 @@ 
1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 
1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752
,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From cfeec21a733940d18d455caa3b6eaf358f3c0b05 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:57:00 -0700 Subject: [PATCH 47/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 6fd22b8b7..18063d828 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 891837f48..38ce8e45b 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,10 +1,2 @@ 
1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752
,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 -1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
-1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From dadcc23ee51855a9b0b2f0e2872bfc0fa5a79d35 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:58:47 -0700 Subject: [PATCH 48/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 18063d828..6e2e15971 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 2); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 8); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 38ce8e45b..d1152bbd7 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,2 +1,8 @@ 
1,1,518,25580,29962,9314,14816,29903,6778,13,12148,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,29966,829,14816,29903,6778,4192,29889,7684,2552,16688,4129,474,1106,363,297,263,2498,4120,654,261,29889,29871,540,29915,29879,7575,322,4780,304,5193,304,1728,1641,18259,5281,29936,540,29915,29879,2337,373,931,297,8790,670,22069,29936,540,29915,29879,23736,630,411,263,2246,29899,1333,305,13457,313,1460,29884,29897,607,590,11825,505,10824,304,592,338,1407,4100,297,1206,1554,5930,322,366,817,25300,708,29936,322,366,508,679,2737,29878,1338,304,1074,4266,2879,1728,2534,304,1074,1075,937,29889,29871,2289,29892,825,901,437,366,817,29973,29871,474,29915,29885,16246,1244,1811,304,1348,310,738,15313,9466,474,505,1048,1075,29892,541,474,29915,29885,2289,11580,263,9654,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,11511,29892,278,1424,11036,310,1641,4942,29889,6650,2552,29915,29879,16500,338,263,12312,310,278,7271,306,29915,345,750,411,577,1784,916,437,14359,297,23526,29907,1192,1781,11619,29892,16403,13925,29889,29871,739,2444,393,670,13925,3763,2360,6089,278,9008,29889,29871,739,5491,4893,29871,29906,6199,310,10324,5432,304,679,385,1234,29889,29871,11644,756,931,363,393,470,10753,304,5376,411,372,29973,29871,306,505,1065,964,445,1108,411,1784,916,437,14359,322,306,925,1016,29915,29873,679,372,29889,29871,887,505,8034,17162,29892,366,505,22069,411,16083,4225,29892,2020,3508,29915,29873,5019,22862,278,9008,29973,29871,739,29915,29879,297,510,1457,26791,1821,322,451,664,278,946,3874,29894,362,29889,29871,739,29915,29879,411,28883,393,306,4459,393,306,505,304,2367,4942,29889,6650,2552,29871,29906,10819,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,6650,2552,756,1063,590,11619,363,2440,322,306,763,1075,29889,29871,306,29915,345,1476,670,8034,304,367,12558,8543,29889,29871,20628,306,2869,2355,304,1074,278,11619,263,2846,6233,4688,29991,259,13,13,3868,2444,1407,17785,411,670,22069,322,670,316,12676,272,338,19780,29892,3447,4148,23378,29889,268,13,13,29902,29915,29885,10932,304,505,4942,29889,6650,2552,408,590,11619,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,1522,264,2675,304,4942,29889,6650,2552,363,975,29871,29896,29900,2440,29889,306,1348,306,471,697,310,670,29871,29896,303,22069,746,540,4687,472,341,29950,29924,29954,29889,940,29915,29879,1063,2107,975,278,2440,322,338,2289,599,1048,278,4802,7623,29889,739,338,1363,310,1075,29892,451,590,1286,4642,330,948,4942,29889,4485,2696,29892,393,306,1476,714,306,505,18755,1007,29879,29889,940,3902,2361,599,3987,411,366,322,338,1407,16500,322,8004,29889,940,1838,29915,29873,16833,322,19514,599,278,1492,5155,29889,18064,17826,322,10753,304,367,8126,297,278,2425,373,1432,9565,310,596,16083,9045,322,596,2834,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,15992,263,5497,297,278,10524,1833,4723,393,1497,4942,29889,6650,2552,338,8401,304,23716,304,2125,263,716,2602,727,297,5306,29889,29871,940,674,367,13726,1407,1568,29889,259,13,13,29902,1348,9138,263,716,11619,297,23526,29907,393,366,2869,763,1795,4359,367,408,28893,408,1811,304,1284,263,2635,21298,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752
,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 +1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 From 99ffd0ed1e4d4f0b24abc52ceaa23409b817fa8c Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 21:59:42 -0700 Subject: [PATCH 49/79] commit --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/start_ids.csv | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 6e2e15971..6fd22b8b7 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -425,7 +425,7 @@ int main(int argc, char* argv[]) // step 4: 
prepare request std::vector pointer_record; // Used to prevent the pointers are release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 8); + prepareRequest(ini_name, node_id, gpu_count, &pointer_record, std::string("/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"), 10); printf("[INFO] request is created : %d\n", request_list.size()); // step 5: Forward diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index d1152bbd7..891837f48 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -6,3 +6,5 @@ 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,1016,29915,29873,1073,825,4942,29889,6650,2552,471,763,1434,29871,8401,304,23716,29892,541,1235,592,2649,366,29892,317,6040,29979,319,12982,29979,515,445,11619,322,445,8034,29889,306,471,2675,304,4942,29889,11717,1434,540,2175,322,6650,2552,3614,975,746,11717,2175,29889,940,338,451,263,1559,292,11619,29889,940,338,871,8852,297,278,1302,29899,10472,322,2534,366,2041,297,363,13589,362,2143,6090,1432,4098,29889,940,674,451,2367,2143,6090,322,1033,3109,1048,22069,29915,29879,18161,18845,29889,24428,304,679,596,29871,29929,29900,3841,10524,3448,1374,2817,4135,2225,699,1980,1549,445,1410,29891,338,263,2958,446,29889,1126,304,1207,13750,1584,15029,29892,670,8034,13925,338,297,2388,300,296,29889,29871,29929,29900,29995,310,278,931,746,366,1246,278,8034,29892,896,29915,645,1925,366,1549,304,263,7314,10524,29892,393,11698,6732,29923,3926,6089,470,3639,596,1246,29889,9134,590,16157,4344,322,10216,505,8459,304,5967,445,6944,1156,10623,3277,1316,1424,11036,29889,450,4152,8034,756,385,26309,763,896,526,2599,366,263,7853,29
889,25538,592,263,2867,29991,624,388,3448,515,445,1574,322,278,6944,29889,887,553,7143,2253,322,896,674,451,367,727,746,366,2289,817,963,29889,306,505,2360,7091,752,14356,304,2436,263,4319,9076,1048,5019,2745,306,1539,445,2224,7492,5566,1509,363,263,11619,1058,338,599,1048,278,6909,7226,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,7488,451,305,11619,297,263,2246,451,305,6944,29889,1815,29915,29873,1827,306,626,18014,746,306,471,12992,304,1075,491,1790,11619,1058,306,1348,338,20695,322,1363,540,3512,304,697,310,278,1900,16083,12462,297,278,4234,29889,29871,13,3112,338,2289,4780,304,679,385,28573,29889,1670,338,13114,4480,304,367,3595,322,670,6592,2975,8214,338,2107,7226,29914,25580,29962 1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,4942,29889,14713,6650,2552,338,263,13568,6288,11619,1058,756,5149,24876,2662,1432,2228,393,590,6532,322,306,505,750,29889,853,4561,1784,310,590,4940,437,14359,29892,4942,29889,6650,2552,338,1407,15579,322,591,505,1063,2221,304,20410,8167,1860,411,1075,322,670,13925,1407,9098,29889,1334,526,9796,304,505,1075,297,278,18403,322,1106,6375,304,1641,670,22069,363,1784,2440,304,2041,7226,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,29915,29885,5007,445,9076,304,2367,366,263,15883,701,1434,366,1074,445,15460,29889,450,8034,13925,322,17517,526,1407,443,23221,15750,29889,306,2175,263,2643,411,2999,2305,11211,590,11118,29892,322,694,697,3926,2000,592,1250,29889,306,750,304,298,618,963,304,679,385,1234,1048,590,11118,29889,29871,13,13,11863,29892,322,1556,4100,29892,1207,1854,596,1663,18541,338,2675,304,4612,4942,29889,6650,2552,29915,29879,1998,1169,322,10416,664,29889,940,13622,304,592,393,306,679,263,9128,29892,322,540,6363,306,471,263,8368,1363,306,5429,1075,29889,306,2355,278,9128,2309,29889,12699,29892,306,1476,714,590,9045,1663,18541,1838,29915,29873,5146,363,5557,1230,1998,1169,29889,306,4520,385,395,29947,29900,29900,29889,29900,29900,11118,363,278,10416,664,29889,306,508,29915,29873,5146,363,590,11118,1363,306,29915,29885,263,8368,322,1016,29915,29873,505,738,274,1161,4972,472,445,1857,931,29889,306,508,29915,29873,4658,278,15460,7656,29915,29873,2367,592,263,15883,701,304,1207,1854,590,1663,18541,723,4612,664,393,9007,29915,29873,5181,322,471,18719,5557,1230,29889,450,8034,508,29915,29873,437,3099,304,1371,592,4612,278,11118,29889,512,6124,29892,278,8034,13925,1497,278,373,375,338,373,592,304,1207,1854,590,1663,18541,18469,1998,1169,29889,4693,4627,1218,6434,21298,29914,25580,29962 
+1,1,518,25580,29962,9314,14816,29903,6778,13,1678,3529,19138,675,278,1426,393,338,2183,29889,7106,925,278,15837,322,694,5684,9678,1288,7928,1316,408,376,29903,545,29892,1244,338,278,15837,310,278,1426,29901,1642,13,1678,529,829,14816,29903,6778,306,5360,4942,29889,6650,2552,29889,6407,11619,306,29915,345,3926,750,29892,322,306,29915,345,750,263,14928,1156,8401,2820,263,3287,297,278,4940,29871,29906,29945,2440,29889,7197,13254,29892,19780,669,20837,29889,29849,22981,3987,322,4893,263,376,1026,29915,29879,1018,278,3203,18677,3236,29908,937,29889,11367,13925,756,16710,14586,355,5794,1951,4942,29889,11717,2175,29889,5057,342,29303,7226,29914,25580,29962 From a89d4b33b7dbf06dcdcceacf4b67543d9c09a882 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:03:23 -0700 Subject: [PATCH 50/79] commit --- src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index 14d31d02d..2ddedd629 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -197,7 +197,7 @@ std::unique_ptr LlamaTritonModel::createMod stream, cublas_wrapper.get(), allocator.get(), - false, + true, cuda_device_prop_ptr.get(), attention_type, int8_mode_, From 4973c9b65928d9a9ad357ca46c1ee70a64f348df Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:05:18 -0700 Subject: [PATCH 51/79] commit --- examples/cpp/llama/llama_triton_example.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 6fd22b8b7..d1f163146 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -481,8 +481,10 @@ int main(int argc, char* argv[]) // if (i < 10) 
printf("%d,", hBuf[i]); - if ((i + 1) % (seq_len) == 0) + if ((i + 1) % (seq_len) == 0) { printf("\n\n"); + break; + } // if ((i + 1) % (seq_len) == 0 && i < 10) // std::cout << std::endl; } From caf7c00f699e6bfc7b76cb20258e70a2f87a58eb Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:05:49 -0700 Subject: [PATCH 52/79] commit --- src/fastertransformer/models/llama/Llama.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 23c5c4e4a..01aaa88d4 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -365,6 +365,7 @@ Llama::Llama(size_t head_num, int8_mode_(int8_mode), shared_contexts_ratio_(shared_contexts_ratio) { + printf("is_free_buffer_after_forward: %d\n", is_free_buffer_after_forward); int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { local_vacab_size = ceil(local_vacab_size / 8.f) * 8; From f7af3698365667967c7c863af30b6a171a7d7fce Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:07:33 -0700 Subject: [PATCH 53/79] commit --- src/fastertransformer/models/llama/Llama.cc | 44 ++++++++++----------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 01aaa88d4..b0dcbf59b 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -115,19 +115,19 @@ void Llama::allocateBuffer( } input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, true)); + decoder_input_buf_ = 
(T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, true)); decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true)); normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true)); nccl_logits_buf_ = - (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true)); h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; @@ -139,40 +139,40 @@ void Llama::allocateBuffer( // prompt_learning weight batch ptrs prompt_learning_weight_batch_ = - (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * 
batchxbeam, false)); + (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, true)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_total_padding_count_ = - (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), true); transposed_output_ids_buf_ = (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); + seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, true)); masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); - start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); - end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, true)); + end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, true)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( - context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); 
+ context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( - context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true)); output_log_probs_buf_ = - (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); + (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, true)); generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); if (shared_contexts_ratio_ > 0.0f) { - shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); - batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); - compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); - compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), true); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), true); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), true); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), true); } is_allocate_buffer_ = true; From 4545f61daca4373caef52782d9abcaa6f5400245 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:08:59 -0700 Subject: [PATCH 54/79] commit --- src/fastertransformer/models/llama/Llama.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index b0dcbf59b..7fdec4cd2 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ 
b/src/fastertransformer/models/llama/Llama.cc @@ -160,19 +160,19 @@ void Llama::allocateBuffer( end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, true)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( - context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true)); + context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true)); output_log_probs_buf_ = - (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, true)); + (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); if (shared_contexts_ratio_ > 0.0f) { - shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), true); - batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), true); - compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), true); - compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), true); + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); } is_allocate_buffer_ = true; From 376110d5bfa7dae78c6a698d0fd6e6625772735a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:09:39 -0700 Subject: [PATCH 55/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 7fdec4cd2..39b37fdfd 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -162,7 +162,7 @@ void Llama::allocateBuffer( context_decoder_input_buf_ = (T*)(allocator_->reMalloc( context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( - context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, true)); + context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); output_log_probs_buf_ = (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); From 74df02735cb399a6a55a40344b88dd91a34332b7 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:10:22 -0700 Subject: [PATCH 56/79] commit --- src/fastertransformer/models/llama/Llama.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 39b37fdfd..d08b892f5 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -147,17 +147,17 @@ void Llama::allocateBuffer( (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_total_padding_count_ = - (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), true); + (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); transposed_output_ids_buf_ = (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); output_ids_buf_ = 
(int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, true)); + seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); - start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, true)); - end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, true)); + start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); + end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); From eaa0a170f514e564d15f47428b1bee9daf86025a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:11:22 -0700 Subject: [PATCH 57/79] commit --- src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index 2ddedd629..a263c9330 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -197,7 +197,7 @@ std::unique_ptr LlamaTritonModel::createMod stream, cublas_wrapper.get(), allocator.get(), - true, + false, // is_free_buffer_after_forward cuda_device_prop_ptr.get(), attention_type, int8_mode_, From 580a7963d1100ff507b8ee3c57dbf696b98d3347 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:12:16 
-0700 Subject: [PATCH 58/79] commit --- src/fastertransformer/models/llama/Llama.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index d08b892f5..ac4590b18 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -127,7 +127,7 @@ void Llama::allocateBuffer( cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true)); finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true)); h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true)); + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; @@ -139,9 +139,9 @@ void Llama::allocateBuffer( // prompt_learning weight batch ptrs prompt_learning_weight_batch_ = - (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, true)); + (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); From 9255f7c46186282edb388c0dfde53c6e73de9ea5 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:12:58 -0700 Subject: [PATCH 59/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc 
b/src/fastertransformer/models/llama/Llama.cc index ac4590b18..231f3bd18 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -127,7 +127,7 @@ void Llama::allocateBuffer( cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true)); finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true)); h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; From debacbd6566effa153e83cbf136abd9b89bef7d5 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:13:39 -0700 Subject: [PATCH 60/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 231f3bd18..e7f8da12a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -141,7 +141,7 @@ void Llama::allocateBuffer( prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); From 62e4177e400a8059f79c6780309e21847a388eb3 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:14:16 -0700 Subject: [PATCH 61/79] commit --- 
src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index e7f8da12a..470c2a888 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -127,7 +127,7 @@ void Llama::allocateBuffer( cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true)); finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true)); h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, true)); + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; From 4f14e32311ccaf6090b0cd383d27144d26874564 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:15:57 -0700 Subject: [PATCH 62/79] commit --- examples/cpp/llama/llama_triton_example.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index d1f163146..5fbc56a1e 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -483,7 +483,6 @@ int main(int argc, char* argv[]) printf("%d,", hBuf[i]); if ((i + 1) % (seq_len) == 0) { printf("\n\n"); - break; } // if ((i + 1) % (seq_len) == 0 && i < 10) // std::cout << std::endl; From 34b48e803306a4e7c46d5d95ab13b52bb8e6c0b2 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:18:24 -0700 Subject: [PATCH 63/79] commit --- src/fastertransformer/models/llama/Llama.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc 
b/src/fastertransformer/models/llama/Llama.cc index 470c2a888..23c5c4e4a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -115,17 +115,17 @@ void Llama::allocateBuffer( } input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, true)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, true)); + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true)); + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, true)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true)); + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); nccl_logits_buf_ = - (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, true)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, true)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, true)); + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * 
batchxbeam, false)); h_finished_buf_ = new bool[batchxbeam]; sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); @@ -141,7 +141,7 @@ void Llama::allocateBuffer( prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); @@ -365,7 +365,6 @@ Llama::Llama(size_t head_num, int8_mode_(int8_mode), shared_contexts_ratio_(shared_contexts_ratio) { - printf("is_free_buffer_after_forward: %d\n", is_free_buffer_after_forward); int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { local_vacab_size = ceil(local_vacab_size / 8.f) * 8; From 596f6d978a29bd538b03156eca5f19e65340199b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:18:28 -0700 Subject: [PATCH 64/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 23c5c4e4a..c889f2db4 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -141,7 +141,7 @@ void Llama::allocateBuffer( prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) 
* batchxbeam * max_input_len, true)); From 5772f09365288393f0c0ad54e66c42ee1c558467 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:19:07 -0700 Subject: [PATCH 65/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index c889f2db4..23c5c4e4a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -141,7 +141,7 @@ void Llama::allocateBuffer( prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); From 96ccec9ef9a0b9f200b9f9b9309346d3dba38038 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:20:00 -0700 Subject: [PATCH 66/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 23c5c4e4a..c889f2db4 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -141,7 +141,7 @@ void Llama::allocateBuffer( prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_input_ids_buf_ = 
(int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); From 04f5ab2bfceb07be377446b74125c0e45019b7f6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:20:19 -0700 Subject: [PATCH 67/79] commit --- .vscode/settings.json | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 1ef97bcca..82000232b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -68,21 +68,6 @@ "future": "cpp", "cfenv": "cpp", "typeindex": "cpp", - "variant": "cpp", - "ios": "cpp", - "__bit_reference": "cpp", - "__config": "cpp", - "__debug": "cpp", - "__errc": "cpp", - "__hash_table": "cpp", - "__locale": "cpp", - "__mutex_base": "cpp", - "__node_handle": "cpp", - "__split_buffer": "cpp", - "__threading_support": "cpp", - "__tree": "cpp", - "__verbose_abort": "cpp", - "charconv": "cpp", - "locale": "cpp" + "variant": "cpp" } } From c79afa9e350c2ae84f3bc534adb42bdff2a9c99b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:37:10 -0700 Subject: [PATCH 68/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index c889f2db4..23c5c4e4a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -141,7 +141,7 @@ void Llama::allocateBuffer( prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); From 
dbd5287ecb23e7bbcd8c95b316d417663450a37c Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:38:13 -0700 Subject: [PATCH 69/79] commit --- src/fastertransformer/models/llama/Llama.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 23c5c4e4a..debe3dbb1 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -640,6 +640,7 @@ void Llama::forward(std::unordered_map* output_ten } // Prefix prompts + printf("has_prefix_prompt_: %d\n", has_prefix_prompt_); if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, prefix_prompt_weight_batch_ptrs.data(), From 599e8dadc9e8969965351c18953aef50ec9804d6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:42:27 -0700 Subject: [PATCH 70/79] commit --- src/fastertransformer/models/llama/Llama.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index debe3dbb1..6e44a4344 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -706,13 +706,13 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } - invokeBuildDecoderAttentionMask(input_attention_mask_, - tiled_input_lengths_buf_, - tiled_prompt_lengths_buf_, - batch_size * beam_width, - max_input_length, - max_prefix_prompt_length, - stream_); + // invokeBuildDecoderAttentionMask(input_attention_mask_, + // tiled_input_lengths_buf_, + // tiled_prompt_lengths_buf_, + // batch_size * beam_width, + // max_input_length, + // max_prefix_prompt_length, + // stream_); sync_check_cuda_error(); std::unordered_map decoder_input_tensors{ From 59f2c935c9262f9c1d198db128aa9b8c498c84e0 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:43:27 -0700 Subject: [PATCH 71/79] commit 
--- src/fastertransformer/models/llama/Llama.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 6e44a4344..7b7c5e706 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -713,7 +713,7 @@ void Llama::forward(std::unordered_map* output_ten // max_input_length, // max_prefix_prompt_length, // stream_); - sync_check_cuda_error(); + // sync_check_cuda_error(); std::unordered_map decoder_input_tensors{ {"decoder_input", @@ -837,15 +837,15 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } - invokeMaskPaddingTokens(masked_tokens_, - input_tensors->at("input_lengths").getPtr(), // not_tiled - tiled_prompt_lengths_buf_, - max_cache_seq_len, - max_input_length + max_prefix_prompt_length, - 0, - batch_size, - beam_width, - stream_); + // invokeMaskPaddingTokens(masked_tokens_, + // input_tensors->at("input_lengths").getPtr(), // not_tiled + // tiled_prompt_lengths_buf_, + // max_cache_seq_len, + // max_input_length + max_prefix_prompt_length, + // 0, + // batch_size, + // beam_width, + // stream_); for (int step = max_input_length; step < (int)max_output_seq_len; step++) { const int src_indir_idx = (step - max_input_length) % 2; From f330f2e88f632dc5fa7e495dcfb542828514f0bd Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:45:12 -0700 Subject: [PATCH 72/79] commit --- src/fastertransformer/models/llama/Llama.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 7b7c5e706..aa4e14abd 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -837,15 +837,15 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } - // 
invokeMaskPaddingTokens(masked_tokens_, - // input_tensors->at("input_lengths").getPtr(), // not_tiled - // tiled_prompt_lengths_buf_, - // max_cache_seq_len, - // max_input_length + max_prefix_prompt_length, - // 0, - // batch_size, - // beam_width, - // stream_); + invokeMaskPaddingTokens(masked_tokens_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + tiled_prompt_lengths_buf_, + max_cache_seq_len, + max_input_length + max_prefix_prompt_length, + 0, + batch_size, + beam_width, + stream_); for (int step = max_input_length; step < (int)max_output_seq_len; step++) { const int src_indir_idx = (step - max_input_length) % 2; From 3ef5d241313e5c75aa2040bd95971b7ee1c6c7cb Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:46:09 -0700 Subject: [PATCH 73/79] commit --- src/fastertransformer/models/llama/Llama.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index aa4e14abd..debe3dbb1 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -706,14 +706,14 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } - // invokeBuildDecoderAttentionMask(input_attention_mask_, - // tiled_input_lengths_buf_, - // tiled_prompt_lengths_buf_, - // batch_size * beam_width, - // max_input_length, - // max_prefix_prompt_length, - // stream_); - // sync_check_cuda_error(); + invokeBuildDecoderAttentionMask(input_attention_mask_, + tiled_input_lengths_buf_, + tiled_prompt_lengths_buf_, + batch_size * beam_width, + max_input_length, + max_prefix_prompt_length, + stream_); + sync_check_cuda_error(); std::unordered_map decoder_input_tensors{ {"decoder_input", From 8e57eb5e270ecaefb39ad97f67421083d03e3aa1 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:50:06 -0700 Subject: [PATCH 74/79] commit --- 
src/fastertransformer/models/llama/Llama.cc | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index debe3dbb1..172170c75 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -640,7 +640,6 @@ void Llama::forward(std::unordered_map* output_ten } // Prefix prompts - printf("has_prefix_prompt_: %d\n", has_prefix_prompt_); if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, prefix_prompt_weight_batch_ptrs.data(), @@ -837,6 +836,21 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } + { + + int* buf; + int st = batch_size * beam_width; + buf = new int[st]; + cudaMemcpy(buf, tiled_prompt_lengths_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost); + { + printf("tiled_prompt_lengths_buf_:\n"); + for (int i=0; iat("input_lengths").getPtr(), // not_tiled tiled_prompt_lengths_buf_, From 3e502435b3451ecbd76d7ebebe05bbfa89c60d71 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:52:12 -0700 Subject: [PATCH 75/79] commit --- src/fastertransformer/models/llama/Llama.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 172170c75..fb6045104 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -853,7 +853,6 @@ void Llama::forward(std::unordered_map* output_ten } invokeMaskPaddingTokens(masked_tokens_, input_tensors->at("input_lengths").getPtr(), // not_tiled - tiled_prompt_lengths_buf_, max_cache_seq_len, max_input_length + max_prefix_prompt_length, 0, From 87cfd581fb2d30b0bc05ea48da9780f5b3a4a42f Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:55:44 -0700 Subject: [PATCH 76/79] commit --- src/fastertransformer/models/llama/Llama.cc | 1 + 1 file changed, 1 insertion(+) 
diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index fb6045104..1cc96b073 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -704,6 +704,7 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); } + printf("invokeBuildDecoderAttentionMask\n"); invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, From 09b5f4501ce6334ad72fda4895867e699d081364 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:58:55 -0700 Subject: [PATCH 77/79] commit --- src/fastertransformer/models/llama/Llama.cc | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 1cc96b073..eabb0f217 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -704,11 +704,10 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); } - printf("invokeBuildDecoderAttentionMask\n"); invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, - tiled_prompt_lengths_buf_, + nullptr, // prefix_prompt_lengths batch_size * beam_width, max_input_length, max_prefix_prompt_length, @@ -837,21 +836,6 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } - { - - int* buf; - int st = batch_size * beam_width; - buf = new int[st]; - cudaMemcpy(buf, tiled_prompt_lengths_buf_, sizeof(int) * st, cudaMemcpyDeviceToHost); - { - printf("tiled_prompt_lengths_buf_:\n"); - for (int i=0; iat("input_lengths").getPtr(), // not_tiled max_cache_seq_len, From 407a8684eae2d84599457120cafae12328bfce36 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 22:59:37 -0700 Subject: [PATCH 78/79] commit --- src/fastertransformer/models/llama/Llama.cc | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index eabb0f217..ad390d551 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -707,7 +707,7 @@ void Llama::forward(std::unordered_map* output_ten invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, - nullptr, // prefix_prompt_lengths + (const int*)nullptr, // prefix_prompt_lengths batch_size * beam_width, max_input_length, max_prefix_prompt_length, From 2d7be1a88e4b1a63917f0a14c6b73125df1c2e80 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 30 Oct 2023 23:02:46 -0700 Subject: [PATCH 79/79] commit --- src/fastertransformer/models/llama/Llama.cc | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 64a7cf5f0..01ebc0e48 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -140,8 +140,6 @@ void Llama::allocateBuffer( // prompt_learning weight batch ptrs prompt_learning_weight_batch_ = (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); - tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, true)); tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); @@ -204,7 +202,6 @@ void Llama::freeBuffer() } allocator_->free((void**)(&prompt_learning_weight_batch_)); - allocator_->free((void**)(&tiled_prompt_lengths_buf_)); allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); @@ -639,22 +636,6 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } - // Prefix prompts - if (has_prefix_prompt_) { - cudaMemcpyAsync(prompt_learning_weight_batch_, - 
prefix_prompt_weight_batch_ptrs.data(), - sizeof(T*) * batch_size * beam_width, - cudaMemcpyDefault, - stream_); - cudaMemcpyAsync(tiled_prompt_lengths_buf_, - prefix_prompt_lengths.data(), - sizeof(int) * batch_size * beam_width, - cudaMemcpyDefault, - stream_); - } - - sync_check_cuda_error(); - // handle first step if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { invokeTileGptInputs(tiled_input_ids_buf_,