From 22f5a69c7a0f5bdd985661f305de1c11520bf182 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 9 Oct 2023 22:05:57 -0700 Subject: [PATCH] commit --- .../triton_backend/bert/BertTritonModel.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/triton_backend/bert/BertTritonModel.cc b/src/fastertransformer/triton_backend/bert/BertTritonModel.cc index 839c2e29c..f03a9281e 100644 --- a/src/fastertransformer/triton_backend/bert/BertTritonModel.cc +++ b/src/fastertransformer/triton_backend/bert/BertTritonModel.cc @@ -124,7 +124,11 @@ BertTritonModel::createModelInstance(int const int max_seq_len = 384; ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, ft::getSMVersion(), is_remove_padding_, max_seq_len); - + bool is_free_buffer_after_forward = false; + char * free_buffer_after_forward = std::getenv("BERT_FREE_BUFFER_AFTER_FORWARD"); + if (free_buffer_after_forward != nullptr && std::string(free_buffer_after_forward) == "ON") { + is_free_buffer_after_forward = true; + } auto bert = std::make_unique<ft::Bert<T>>(ft::Bert<T>(0, // max_batch_size, FT will adjust the buffer automatically. 0, // max_seq_len, FT will adjust the buffer automatically. @@ -137,7 +141,7 @@ BertTritonModel::createModelInstance(int stream, cublas_wrapper.get(), allocator.get(), - false, + is_free_buffer_after_forward, attention_type, is_sparse_, activation_type_,