From fdd1001fbc9141f617157e62e86512919ccac348 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 09:47:35 -0800 Subject: [PATCH 01/24] print some --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index 206bb2621..b81687653 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,6 +448,8 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; + printf("Tensor stop words list is: %d, %d, %d", input_tensors->at("stop_words_list").getVal(0), input_tensors->at("stop_words_list").getVal(1), input_tensors->at("stop_words_list").getVal(2)); + invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), input_tensors->at("stop_words_list") From 82198db63d82fea057f178605642a686c4003dd2 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 09:55:23 -0800 Subject: [PATCH 02/24] f --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index b81687653..da0745b23 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,7 +448,7 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; - printf("Tensor stop words list is: %d, %d, %d", input_tensors->at("stop_words_list").getVal(0), input_tensors->at("stop_words_list").getVal(1), input_tensors->at("stop_words_list").getVal(2)); + printf("Tensor stop words list is: %d", input_tensors->at("stop_words_list")); invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), From 0c9e93ffd823dc5184578fe50e067bb683b929ad Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 09:58:53 -0800 Subject: [PATCH 03/24] update --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index da0745b23..52ddcbfef 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,7 +448,7 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; - printf("Tensor stop words list is: %d", input_tensors->at("stop_words_list")); + printf("Tensor stop words list is size: %d", stop_words_length); invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), From ca9679f47290a847146ea901089e28470999af18 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 10:09:11 -0800 Subject: [PATCH 04/24] w --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index 52ddcbfef..d91cafc03 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,7 +448,7 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; - printf("Tensor stop words list is size: %d", stop_words_length); + printf("Tensor stop words list is size: %d", input_tensors->at("stop_words_list").shape[1]); invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), From 2df390a284be12015ef7b97b1da49de2b6e68d5c Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 10:10:46 -0800 Subject: [PATCH 05/24] f --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index d91cafc03..ba3412ef7 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,7 +448,7 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; - printf("Tensor stop words list is size: %d", input_tensors->at("stop_words_list").shape[1]); + printf("Tensor stop words list is size: %d", input_tensors->at("stop_words_list").shape[3]); invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), From 227dc8991b6283fc20ac11e87eef118e9d537539 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 10:16:14 -0800 Subject: [PATCH 06/24] df --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index ba3412ef7..eccdb0257 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,7 +448,7 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; - printf("Tensor stop words list is size: %d", input_tensors->at("stop_words_list").shape[3]); + printf("Tensor stop words list is: %d, %d", input_tensors->at("stop_words_list")[0], input_tensors->at("stop_words_list")[1]); invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), From 848fdecece0a7ac3c1059c578732fa2a34f7e03b Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 10:58:19 -0800 Subject: [PATCH 07/24] d --- examples/cpp/llama/stop_words.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv index 919477c4d..f12836136 100644 --- a/examples/cpp/llama/stop_words.csv +++ b/examples/cpp/llama/stop_words.csv @@ -1 +1 @@ -29961, 25580, 29962 \ No newline at end of file +29961,25580,29962 \ No newline at end of file From c7c474233e82ac0d040af51a3814b70f7154fd87 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 11:33:21 -0800 Subject: [PATCH 08/24] f --- src/fastertransformer/layers/DynamicDecodeLayer.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/fastertransformer/layers/DynamicDecodeLayer.cc b/src/fastertransformer/layers/DynamicDecodeLayer.cc index eccdb0257..206bb2621 100644 --- a/src/fastertransformer/layers/DynamicDecodeLayer.cc +++ b/src/fastertransformer/layers/DynamicDecodeLayer.cc @@ -448,8 +448,6 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ const size_t id_offset = ite * local_batch_size * beam_width; const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2]; - printf("Tensor stop words list is: %d, %d", input_tensors->at("stop_words_list")[0], input_tensors->at("stop_words_list")[1]); - invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), output_tensors->at("parent_ids").getPtr(), input_tensors->at("stop_words_list") From b824d63aea2e95d45d91836aa6ac3c271d4efe5c Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 11:46:13 -0800 Subject: [PATCH 09/24] f --- src/fastertransformer/kernels/stop_criteria_kernels.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu index a8d4b98fa..aa2889139 100644 --- a/src/fastertransformer/kernels/stop_criteria_kernels.cu +++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu @@ -49,6 +49,14 @@ __global__ void stop_words_criterion(const int* output_ids, /* The single-token case unconditionally bans the token */ bool should_stop = false; + printf("------\n") + printf("step %d \n", step) + printf("item size %d \n", item_size) + printf("item_start %d \n", item_start) + printf("item_end %d \n", item_end) + printf("base_stop_words %d \n", *base_stop_words) + printf("base offsets %d \n", *base_offsets) + /* Enough previously generated tokens to look for a match */ if (step + 1 >= item_size) { should_stop = true; From 7475a19b24fd5446ca33724c84fb4313c07f0ecf Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 11:46:44 -0800 Subject: [PATCH 10/24] d --- .../kernels/stop_criteria_kernels.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu index aa2889139..24501165c 100644 --- a/src/fastertransformer/kernels/stop_criteria_kernels.cu +++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu @@ -49,13 +49,13 @@ __global__ void stop_words_criterion(const int* output_ids, /* The single-token case unconditionally bans the token */ bool should_stop = false; - printf("------\n") - printf("step %d \n", step) - printf("item size %d \n", item_size) - printf("item_start %d \n", item_start) - printf("item_end %d \n", item_end) - printf("base_stop_words %d \n", *base_stop_words) - printf("base offsets %d \n", *base_offsets) + printf("------\n"); + printf("step %d \n", step); + printf("item size %d \n", item_size); + printf("item_start %d \n", item_start); + printf("item_end %d \n", item_end); + printf("base_stop_words %d \n", *base_stop_words); + printf("base offsets %d \n", *base_offsets); /* Enough previously generated tokens to look for a match */ if (step + 1 >= item_size) { From e7f50a546a610394508ec936800f3c4c70ffe1c4 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:26:34 -0800 Subject: [PATCH 11/24] print --- examples/cpp/llama/llama_triton_example.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b56e9997e..4e4a8c21a 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -68,6 +68,12 @@ broadCastRequest(const std::vector& v_start_ids, v_tiled_stop_words.insert(v_tiled_stop_words.end(), v_stop_words.begin(), v_stop_words.end()); } + printf("Contents of v_tiled_stop_words:\n"); + for (size_t i = 0; i < v_tiled_stop_words.size(); i++) { + printf("%d ", v_tiled_stop_words[i]); + } + printf("\n"); + ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); @@ -137,6 +143,16 @@ broadCastRequest(const std::vector& v_start_ids, pointer_record->push_back(start_ids_ptr); pointer_record->push_back(end_ids_ptr); + printf("Stop words len %d", stop_words_len) + printf("Size v_input_stop_words %d", d_input_stop_words.size()) + + printf("Contents of d_input_stop_words:\n"); + for (size_t i = 0; i < d_input_stop_words.size(); i++) { + printf("%d ", d_input_stop_words[i]); + } + printf("\n"); + + request_list.push_back(std::shared_ptr>( new std::unordered_map{ {"input_ids", From 72175ae3a754892e96317447f534bf203ef72716 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:27:04 -0800 Subject: [PATCH 12/24] d --- examples/cpp/llama/llama_triton_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 4e4a8c21a..f4a6a9d2b 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -143,8 +143,8 @@ broadCastRequest(const std::vector& v_start_ids, pointer_record->push_back(start_ids_ptr); pointer_record->push_back(end_ids_ptr); - printf("Stop words len %d", stop_words_len) - printf("Size v_input_stop_words %d", d_input_stop_words.size()) + printf("Stop words len %d", stop_words_len); + printf("Size v_input_stop_words %d", d_input_stop_words.size()); printf("Contents of d_input_stop_words:\n"); for (size_t i = 0; i < d_input_stop_words.size(); i++) { From fc2826bd6413b9084b99a61c3eaa40e784a75032 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:28:16 -0800 Subject: [PATCH 13/24] d --- examples/cpp/llama/llama_triton_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index f4a6a9d2b..6ab38232b 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -144,11 +144,11 @@ broadCastRequest(const std::vector& v_start_ids, pointer_record->push_back(end_ids_ptr); printf("Stop words len %d", stop_words_len); - printf("Size v_input_stop_words %d", d_input_stop_words.size()); + printf("Size v_input_stop_words %d", v_input_stop_words.size()); printf("Contents of d_input_stop_words:\n"); for (size_t i = 0; i < d_input_stop_words.size(); i++) { - printf("%d ", d_input_stop_words[i]); + printf("%d ", *d_input_stop_words[i]); } printf("\n"); From 1c80bb1341023e2ea33562c098fa40c7e0786dec Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:29:08 -0800 Subject: [PATCH 14/24] d --- examples/cpp/llama/llama_triton_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 6ab38232b..40e444c38 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -147,8 +147,8 @@ broadCastRequest(const std::vector& v_start_ids, printf("Size v_input_stop_words %d", v_input_stop_words.size()); printf("Contents of d_input_stop_words:\n"); - for (size_t i = 0; i < d_input_stop_words.size(); i++) { - printf("%d ", *d_input_stop_words[i]); + for (size_t i = 0; i < v_input_stop_words.size(); i++) { + printf("%d ", d_input_stop_words[i]); } printf("\n"); From dc006973853d6ba7bdaaf4f5de7702a712560c9b Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:51:06 -0800 Subject: [PATCH 15/24] s --- examples/cpp/llama/llama_triton_example.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 40e444c38..56f09737c 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -59,6 +59,7 @@ broadCastRequest(const std::vector& v_start_ids, int size_1 = v_start_ids.size(); int size_2 = v_start_lengths.size(); int size_bad_words = v_bad_words.size(); + printf("v_stop_words_size %d \n", v_stop_words.size()) int size_stop_words = v_stop_words.size() * size_2; int stop_words_len = v_stop_words.size() / 2; @@ -146,13 +147,6 @@ broadCastRequest(const std::vector& v_start_ids, printf("Stop words len %d", stop_words_len); printf("Size v_input_stop_words %d", v_input_stop_words.size()); - printf("Contents of d_input_stop_words:\n"); - for (size_t i = 0; i < v_input_stop_words.size(); i++) { - printf("%d ", d_input_stop_words[i]); - } - printf("\n"); - - request_list.push_back(std::shared_ptr>( new std::unordered_map{ {"input_ids", From 5b9552c120f17f55e97a5a9b7916f5aca0cafa79 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:52:05 -0800 Subject: [PATCH 16/24] s --- examples/cpp/llama/llama_triton_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 56f09737c..51042d797 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -59,7 +59,7 @@ broadCastRequest(const std::vector& v_start_ids, int size_1 = v_start_ids.size(); int size_2 = v_start_lengths.size(); int size_bad_words = v_bad_words.size(); - printf("v_stop_words_size %d \n", v_stop_words.size()) + printf("v_stop_words_size %d \n", v_stop_words.size()); int size_stop_words = v_stop_words.size() * size_2; int stop_words_len = v_stop_words.size() / 2; From 4e757ec91b2e4c0ce45bf80960f66d8e329d256e Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 15:54:31 -0800 Subject: [PATCH 17/24] d --- examples/cpp/llama/stop_words.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv index f12836136..0a51c6210 100644 --- a/examples/cpp/llama/stop_words.csv +++ b/examples/cpp/llama/stop_words.csv @@ -1 +1,2 @@ -29961,25580,29962 \ No newline at end of file +29961,25580,29962 +-1,-1,-1 \ No newline at end of file From 33eac061f88224c8fdbc1f3ea1a28dbcd2e2290d Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 16:30:55 -0800 Subject: [PATCH 18/24] d --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/stop_words.csv | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 51042d797..f660b00af 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -61,7 +61,7 @@ broadCastRequest(const std::vector& v_start_ids, int size_bad_words = v_bad_words.size(); printf("v_stop_words_size %d \n", v_stop_words.size()); int size_stop_words = v_stop_words.size() * size_2; - int stop_words_len = v_stop_words.size() / 2; + int stop_words_len = v_stop_words.size(); // Tile with same dict for each element std::vector v_tiled_stop_words; diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv index 0a51c6210..f12836136 100644 --- a/examples/cpp/llama/stop_words.csv +++ b/examples/cpp/llama/stop_words.csv @@ -1,2 +1 @@ -29961,25580,29962 --1,-1,-1 \ No newline at end of file +29961,25580,29962 \ No newline at end of file From c013067c86465d3e30eb70db5f031ef7c6cb5dba Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 16:45:00 -0800 Subject: [PATCH 19/24] log --- .../kernels/stop_criteria_kernels.cu | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu index 24501165c..7d5477612 100644 --- a/src/fastertransformer/kernels/stop_criteria_kernels.cu +++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu @@ -50,20 +50,26 @@ __global__ void stop_words_criterion(const int* output_ids, bool should_stop = false; printf("------\n"); - printf("step %d \n", step); + printf("id %d", id); + printf("batch idx %d", batch_idx); + printf("beam idx %d", beam_idx); + printf("base_stop_words %d \n", *base_stop_words); + printf("base offsets %d \n", *base_offsets); printf("item size %d \n", item_size); printf("item_start %d \n", item_start); printf("item_end %d \n", item_end); - printf("base_stop_words %d \n", *base_stop_words); - printf("base offsets %d \n", *base_offsets); + printf("step %d \n", step); + printf("id offset %d \n", id_offset); /* Enough previously generated tokens to look for a match */ if (step + 1 >= item_size) { + printf("in matching"); should_stop = true; int parent_id = beam_idx; const bool gather_beam = beam_width > 1; for (int token_idx = item_size - 1; token_idx >= 0; token_idx--) { + printf("in loop"); const int previous_token = output_ids[(step - (item_size - 1) + token_idx) * batch_size * beam_width + id_offset + batch_idx * beam_width + parent_id]; From 58c0160839ff8e056efad386fbfb58247310696c Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 17:31:25 -0800 Subject: [PATCH 20/24] try --- examples/cpp/llama/llama_config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index c585b2aca..f61d8229e 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -9,7 +9,7 @@ model_name=llama model_dir=/notebooks/models/llama-2-7b-32k-instruct-tp1_llama_decoder/1/1-gpu [request] -beam_width=0 # beam width for beam search +beam_width=1 # beam width for beam search top_k=0 ; k value for top k sampling top_p=0.0 ; p value for top p sampling temperature=0 ; Use for sampling From da6f456c2dfac40068b50ddec8850a95581c1079 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 17:52:54 -0800 Subject: [PATCH 21/24] j --- examples/cpp/llama/llama_triton_example.cc | 2 +- examples/cpp/llama/stop_words.csv | 3 ++- src/fastertransformer/kernels/stop_criteria_kernels.cu | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index f660b00af..51042d797 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -61,7 +61,7 @@ broadCastRequest(const std::vector& v_start_ids, int size_bad_words = v_bad_words.size(); printf("v_stop_words_size %d \n", v_stop_words.size()); int size_stop_words = v_stop_words.size() * size_2; - int stop_words_len = v_stop_words.size(); + int stop_words_len = v_stop_words.size() / 2; // Tile with same dict for each element std::vector v_tiled_stop_words; diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv index f12836136..9e7ac5d01 100644 --- a/examples/cpp/llama/stop_words.csv +++ b/examples/cpp/llama/stop_words.csv @@ -1 +1,2 @@ -29961,25580,29962 \ No newline at end of file +29961,25580,29962 +3,-1,-1 \ No newline at end of file diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu index 7d5477612..5a480d169 100644 --- a/src/fastertransformer/kernels/stop_criteria_kernels.cu +++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu @@ -111,6 +111,7 @@ void invokeStopWordsCriterion(const int* output_ids, block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); grid.x = (stop_words_len + block.x - 1) / block.x; grid.y = batch_size * beam_width; + printf("\nBeam Width in involke %d", beam_width); stop_words_criterion<<>>( output_ids, parent_ids, stop_words, finished, id_offset, stop_words_len, batch_size, beam_width, step); From 62eda805f4e784989d8329b5a63e20577cc6777d Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 17:57:11 -0800 Subject: [PATCH 22/24] d --- .../kernels/stop_criteria_kernels.cu | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/fastertransformer/kernels/stop_criteria_kernels.cu b/src/fastertransformer/kernels/stop_criteria_kernels.cu index 5a480d169..a8d4b98fa 100644 --- a/src/fastertransformer/kernels/stop_criteria_kernels.cu +++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu @@ -49,27 +49,13 @@ __global__ void stop_words_criterion(const int* output_ids, /* The single-token case unconditionally bans the token */ bool should_stop = false; - printf("------\n"); - printf("id %d", id); - printf("batch idx %d", batch_idx); - printf("beam idx %d", beam_idx); - printf("base_stop_words %d \n", *base_stop_words); - printf("base offsets %d \n", *base_offsets); - printf("item size %d \n", item_size); - printf("item_start %d \n", item_start); - printf("item_end %d \n", item_end); - printf("step %d \n", step); - printf("id offset %d \n", id_offset); - /* Enough previously generated tokens to look for a match */ if (step + 1 >= item_size) { - printf("in matching"); should_stop = true; int parent_id = beam_idx; const bool gather_beam = beam_width > 1; for (int token_idx = item_size - 1; token_idx >= 0; token_idx--) { - printf("in loop"); const int previous_token = output_ids[(step - (item_size - 1) + token_idx) * batch_size * beam_width + id_offset + batch_idx * beam_width + parent_id]; @@ -111,7 +97,6 @@ void invokeStopWordsCriterion(const int* output_ids, block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); grid.x = (stop_words_len + block.x - 1) / block.x; grid.y = batch_size * beam_width; - printf("\nBeam Width in involke %d", beam_width); stop_words_criterion<<>>( output_ids, parent_ids, stop_words, finished, id_offset, stop_words_len, batch_size, beam_width, step); From a557047e164e071f22169119dca0bdab321af5b0 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 17:59:12 -0800 Subject: [PATCH 23/24] d --- examples/cpp/llama/llama_triton_example.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 51042d797..b56e9997e 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -59,7 +59,6 @@ broadCastRequest(const std::vector& v_start_ids, int size_1 = v_start_ids.size(); int size_2 = v_start_lengths.size(); int size_bad_words = v_bad_words.size(); - printf("v_stop_words_size %d \n", v_stop_words.size()); int size_stop_words = v_stop_words.size() * size_2; int stop_words_len = v_stop_words.size() / 2; @@ -69,12 +68,6 @@ broadCastRequest(const std::vector& v_start_ids, v_tiled_stop_words.insert(v_tiled_stop_words.end(), v_stop_words.begin(), v_stop_words.end()); } - printf("Contents of v_tiled_stop_words:\n"); - for (size_t i = 0; i < v_tiled_stop_words.size(); i++) { - printf("%d ", v_tiled_stop_words[i]); - } - printf("\n"); - ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); @@ -144,9 +137,6 @@ broadCastRequest(const std::vector& v_start_ids, pointer_record->push_back(start_ids_ptr); pointer_record->push_back(end_ids_ptr); - printf("Stop words len %d", stop_words_len); - printf("Size v_input_stop_words %d", v_input_stop_words.size()); - request_list.push_back(std::shared_ptr>( new std::unordered_map{ {"input_ids", From 7cda5f3450ce6f9289b3cdb2df2b999bf62d5f91 Mon Sep 17 00:00:00 2001 From: Yahia Bsat Date: Fri, 17 Nov 2023 18:14:44 -0800 Subject: [PATCH 24/24] no beam --- examples/cpp/llama/llama_config.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index f61d8229e..c585b2aca 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -9,7 +9,7 @@ model_name=llama model_dir=/notebooks/models/llama-2-7b-32k-instruct-tp1_llama_decoder/1/1-gpu [request] -beam_width=1 # beam width for beam search +beam_width=0 # beam width for beam search top_k=0 ; k value for top k sampling top_p=0.0 ; p value for top p sampling temperature=0 ; Use for sampling