Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
60a28cf
commit
sfc-gh-zhwang Aug 10, 2023
04e2ad3
commit
sfc-gh-zhwang Aug 10, 2023
373db6e
commit
sfc-gh-zhwang Aug 10, 2023
eb02b4f
commit
sfc-gh-zhwang Aug 10, 2023
081fc1d
commit
sfc-gh-zhwang Aug 10, 2023
7f51d1c
commit
sfc-gh-zhwang Aug 10, 2023
421c8f9
commit
sfc-gh-zhwang Aug 10, 2023
7adf42e
commit
sfc-gh-zhwang Aug 10, 2023
0e22eba
commit
sfc-gh-zhwang Aug 10, 2023
fad7a56
commit
sfc-gh-zhwang Aug 10, 2023
87609f8
commit
sfc-gh-zhwang Aug 10, 2023
dbf67f7
commit
sfc-gh-zhwang Aug 11, 2023
9049fc2
commit
sfc-gh-zhwang Aug 11, 2023
fcb4af9
commit
sfc-gh-zhwang Aug 11, 2023
7f40dfb
commit
sfc-gh-zhwang Aug 11, 2023
791ae39
commit
sfc-gh-zhwang Aug 11, 2023
35a83c3
commit
sfc-gh-zhwang Aug 11, 2023
a6291dc
commit
sfc-gh-zhwang Aug 11, 2023
28f4b38
commit
sfc-gh-zhwang Aug 11, 2023
78d55bb
commit
sfc-gh-zhwang Aug 12, 2023
184d3c7
commit
sfc-gh-zhwang Aug 13, 2023
776b431
commit
sfc-gh-zhwang Aug 14, 2023
8633a09
commit
sfc-gh-zhwang Aug 14, 2023
f405938
commit
sfc-gh-zhwang Aug 14, 2023
3b80acf
commit
sfc-gh-zhwang Aug 14, 2023
a00767f
commit
sfc-gh-zhwang Aug 14, 2023
3b5beca
commit
sfc-gh-zhwang Aug 14, 2023
404bd9e
commit
sfc-gh-zhwang Aug 14, 2023
ffbd8c2
commit
sfc-gh-zhwang Aug 14, 2023
53e3c50
commit
sfc-gh-zhwang Aug 14, 2023
b39129b
commit
sfc-gh-zhwang Aug 14, 2023
06b27b7
commit
sfc-gh-zhwang Aug 14, 2023
9ddcbad
commit
sfc-gh-zhwang Aug 14, 2023
e4beb12
commit
sfc-gh-zhwang Aug 14, 2023
ab887b7
commit
sfc-gh-zhwang Aug 15, 2023
50af94f
commit
sfc-gh-zhwang Aug 15, 2023
8f5441f
commit
sfc-gh-zhwang Aug 15, 2023
a734b27
commit
sfc-gh-zhwang Aug 15, 2023
0da489e
commit
sfc-gh-zhwang Aug 15, 2023
35ef772
commit
sfc-gh-zhwang Aug 15, 2023
3fe7146
commit
sfc-gh-zhwang Aug 15, 2023
98ac11a
commit
sfc-gh-zhwang Aug 15, 2023
9d9c7ec
commit
sfc-gh-zhwang Aug 15, 2023
8cef8d7
commit
sfc-gh-zhwang Aug 15, 2023
65606d3
commit
sfc-gh-zhwang Aug 15, 2023
566067d
commit
sfc-gh-zhwang Aug 15, 2023
722f6a5
commit
sfc-gh-zhwang Aug 15, 2023
23fb5ec
commit
sfc-gh-zhwang Aug 15, 2023
82cddaa
commit
sfc-gh-zhwang Aug 15, 2023
cc18192
commit
sfc-gh-zhwang Aug 15, 2023
bacdc3d
commit
sfc-gh-zhwang Aug 15, 2023
1cd3b51
commit
sfc-gh-zhwang Aug 15, 2023
bddb5d2
commit
sfc-gh-zhwang Aug 16, 2023
e7fcd09
commit
sfc-gh-zhwang Aug 16, 2023
faee7c8
commit
sfc-gh-zhwang Aug 16, 2023
0a7bf2d
commit
sfc-gh-zhwang Aug 16, 2023
0a6b27a
commit
sfc-gh-zhwang Aug 16, 2023
1ff58c6
commit
sfc-gh-zhwang Aug 16, 2023
e127037
commit
sfc-gh-zhwang Aug 16, 2023
b7b186f
commit
sfc-gh-zhwang Aug 16, 2023
6254f0c
commit
sfc-gh-zhwang Aug 16, 2023
0272635
commit
sfc-gh-zhwang Aug 17, 2023
bfaa93c
commit
sfc-gh-zhwang Aug 17, 2023
c14c7e6
commit
sfc-gh-zhwang Aug 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,33 @@
"unordered_set": "cpp",
"future": "cpp",
"cfenv": "cpp",
"typeindex": "cpp"
"typeindex": "cpp",
"__bit_reference": "cpp",
"__bits": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__split_buffer": "cpp",
"__threading_support": "cpp",
"__tree": "cpp",
"__tuple": "cpp",
"__verbose_abort": "cpp",
"bit": "cpp",
"ios": "cpp",
"locale": "cpp",
"queue": "cpp",
"stack": "cpp",
"variant": "cpp",
"__nullptr": "cpp",
"__string": "cpp",
"compare": "cpp",
"concepts": "cpp",
"filesystem": "cpp",
"__memory": "cpp",
"version": "cpp"
}
}
}
19 changes: 19 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@ project(FasterTransformer LANGUAGES CXX CUDA)

find_package(CUDA 10.2 REQUIRED)

# Fetch NVIDIA CUTLASS at configure time, pinned to an exact commit for
# reproducible builds.
include(FetchContent)

FetchContent_Declare(
repo-cutlass
GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
GIT_TAG cc85b64cf676c45f98a17e3a47c0aafcf817f088
)

# CUTLASS is consumed header-only; this skips building its libraries and tests.
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

FetchContent_MakeAvailable(repo-cutlass)

# NOTE(review): these paths point at an in-tree 3rdparty/cutlass checkout, not
# at the FetchContent copy downloaded above — confirm which CUTLASS checkout the
# build is actually meant to use.
set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)

if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
add_definitions("-DENABLE_BF16")
message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag")
Expand Down Expand Up @@ -346,6 +361,9 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:GptNeoXTritonBackend>
$<TARGET_OBJECTS:GptNeoXWeight>
$<TARGET_OBJECTS:LinearAdapterLayer>
$<TARGET_OBJECTS:llama_fmha>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:OnlineBeamSearchLayer>
$<TARGET_OBJECTS:ParallelGpt>
$<TARGET_OBJECTS:ParallelGptContextDecoder>
Expand Down Expand Up @@ -466,6 +484,7 @@ set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON
set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
target_link_libraries(transformer-shared PUBLIC stdc++fs)

include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer)
Expand Down
15 changes: 15 additions & 0 deletions src/fastertransformer/kernels/decoder_masked_multihead_attention.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ struct Multihead_attention_params_base {
const float* qkv_scale_out = nullptr;
const float* attention_out_scale = nullptr;
int int8_mode = 0;

float attention_k_scale = 0.f;
float attention_v_scale = 0.f;
};

template<typename T, bool CROSS_ATTENTION>
Expand All @@ -135,6 +138,12 @@ struct Multihead_attention_params: public Multihead_attention_params_base<T> {

// required in case of masked attention with different length
const int* length_per_sample = nullptr;

T** k_cache_per_sample = nullptr;
T** v_cache_per_sample = nullptr;
size_t kv_cache_per_sample_offset = 0;
bool k_cache_interleaved = true;
int num_kv_heads = 0;
};

template<typename T>
Expand All @@ -152,6 +161,12 @@ struct Multihead_attention_params<T, true>: public Multihead_attention_params_ba

// required in case of masked attention with different length
const int* length_per_sample = nullptr;

T** k_cache_per_sample = nullptr;
T** v_cache_per_sample = nullptr;
size_t kv_cache_per_sample_offset = 0;
bool k_cache_interleaved = true;
int num_kv_heads = 0;
};

template<class T>
Expand Down
55 changes: 55 additions & 0 deletions src/fastertransformer/kernels/unfused_attention_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1556,6 +1556,42 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
}
}

// Overload of invokeAddFusedQKVBiasTranspose that additionally takes a distinct
// KV-head count (kv_head_num) and a per-sequence history_length. It splits the
// packed QKV activations into q/k/v buffers, adding the QKV bias and applying
// rotary position embedding via the FUSED_QKV_BIAS_TRANSPOSE_LAUNCH macro.
// NOTE(review): kv_head_num and history_length are not referenced in this
// visible body — presumably they are consumed by name inside the launch macro;
// confirm against the macro definition.
template<typename T>
void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* k_buf,
T* v_buf,
PrefixPromptBatchWeightsParam<T> param,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int batch_size,
const int seq_len,
const int token_num,
const int head_num,
const int kv_head_num,
const int size_per_head,
const int rotary_embedding_dim,
const int neox_rotary_style,
const float* scale,
const int int8_mode,
cudaStream_t stream)
{
// This overload requires a nonzero rotary embedding dimension.
FT_CHECK(rotary_embedding_dim);
FT_CHECK_WITH_INFO(int8_mode != 2, "w8a8 not yet implemented with prefix prompt"); // TODO(mseznec)
// To implement rotary embeddings, each thread processes two QKV elems:
// Block size rounded up to a multiple of the warp size (32).
dim3 block((size_per_head / Vec_t<T>::size + 31) / 32 * 32);
// One grid row per token (including prefix-prompt slots), one column per Q head.
dim3 grid(token_num + batch_size * param.max_prefix_prompt_length, head_num);
// Dynamic shared memory is only needed for the GPT-NeoX rotary style.
size_t smem_size = neox_rotary_style ? 2 * rotary_embedding_dim * sizeof(T) : 0;
// NOTE: add offset for rotary embedding
if (param.max_prefix_prompt_length == 0) {
FUSED_QKV_BIAS_TRANSPOSE_LAUNCH(T, false);
}
else {
FUSED_QKV_BIAS_TRANSPOSE_LAUNCH(T, true);
}
}

#define INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(T) \
template void invokeAddFusedQKVBiasTranspose(T* q_buf, \
T* k_buf, \
Expand All @@ -1573,6 +1609,25 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
const int neox_rotary_style, \
const float* scale, \
const int int8_mode, \
cudaStream_t stream); \
template void invokeAddFusedQKVBiasTranspose(T* q_buf, \
T* k_buf, \
T* v_buf, \
PrefixPromptBatchWeightsParam<T> param, \
T* QKV, \
const T* qkv_bias, \
const int* padding_offset, \
const int* history_length, \
const int batch_size, \
const int seq_len, \
const int token_num, \
const int head_num, \
const int kv_head_num, \
const int size_per_head, \
const int rotary_embedding_dim, \
const int neox_rotary_style, \
const float* scale, \
const int int8_mode, \
cudaStream_t stream)
INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(float);
INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(half);
Expand Down
21 changes: 21 additions & 0 deletions src/fastertransformer/kernels/unfused_attention_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,27 @@ struct PrefixPromptBatchWeightsParam {
const size_t prefix_prompt_layer_offset_per_seq = 0;
};

// Declaration of the grouped-query-attention variant of
// invokeAddFusedQKVBiasTranspose: kv_head_num may differ from head_num, and
// history_length carries each sequence's prior context length. Splits packed
// QKV into q_buf/k_buf/v_buf with bias add and rotary embedding on `stream`.
template<typename T>
void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* k_buf,
T* v_buf,
PrefixPromptBatchWeightsParam<T> param,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int batch_size,
const int seq_len,
const int token_num,
const int head_num,
const int kv_head_num,
const int size_per_head,
const int rotary_embedding_dim,
const int neox_rotary_style,
const float* scale,
const int int8_mode,
cudaStream_t stream);

template<typename T>
void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* k_buf,
Expand Down
8 changes: 8 additions & 0 deletions src/fastertransformer/layers/DynamicDecodeLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ class DynamicDecodeLayer: public BaseLayer {
int* h_pinned_finished_sum_ = nullptr;

public:
// Returns the curand state buffer owned by the top-k decode layer.
curandState_t* topk_curandstate_buf()
{
    auto* sampling_layer = static_cast<BaseSamplingLayer<T>*>(topk_decode_);
    return sampling_layer->curandstate_buf();
}
// Returns the curand state buffer owned by the top-p decode layer.
curandState_t* topp_curandstate_buf()
{
    auto* sampling_layer = static_cast<BaseSamplingLayer<T>*>(topp_decode_);
    return sampling_layer->curandstate_buf();
}
DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ class BaseSamplingLayer: public DynamicDecodeBaseLayer {
virtual void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p);

public:
// Accessor for the layer's internal curand state buffer.
curandState_t* curandstate_buf()
{
    return this->curandstate_buf_;
}

BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
Expand Down
1 change: 1 addition & 0 deletions src/fastertransformer/models/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ add_subdirectory(bert_fp8)
endif()
add_subdirectory(deberta)
add_subdirectory(decoder)
add_subdirectory(llama)
add_subdirectory(longformer)
add_subdirectory(decoding)
add_subdirectory(xlnet)
Expand Down
37 changes: 37 additions & 0 deletions src/fastertransformer/models/llama/Barrier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/fastertransformer/utils/logger.h"
#include <pthread.h>

namespace fastertransformer {

// RAII wrapper over a POSIX thread barrier: `count` threads calling wait()
// all block until the last participant arrives.
class Barrier {
public:
    Barrier(unsigned count)
    {
        FT_LOG_INFO("Barrier(%d)", (int)count);
        // NOTE(review): pthread_barrier_init's return value is ignored; it can
        // fail (e.g. EINVAL when count == 0) — consider checking it.
        pthread_barrier_init(&barrier_, nullptr, count);
    }

    // Block until `count` threads (as given at construction) have called wait().
    void wait()
    {
        pthread_barrier_wait(&barrier_);
    }

    ~Barrier()
    {
        pthread_barrier_destroy(&barrier_);
    }

    // An initialized pthread_barrier_t must not be copied or relocated.
    Barrier(const Barrier&) = delete;
    Barrier& operator=(const Barrier&) = delete;
    Barrier(Barrier&&) noexcept = delete;
    Barrier& operator=(Barrier&&) noexcept = delete;

private:
    pthread_barrier_t barrier_{};
};

} // namespace fastertransformer
43 changes: 43 additions & 0 deletions src/fastertransformer/models/llama/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) OpenMMLab. All rights reserved.

cmake_minimum_required(VERSION 3.8)

# Fused multi-head attention kernels consumed by the Llama target below.
add_subdirectory(fused_multi_head_attention)

# Core Llama model library: decoder, attention layers, FFN, weights and kernels.
add_library(Llama STATIC
    LlamaV2.cc
    LlamaBatch.cc
    LlamaCacheManager.cc
    LlamaContextDecoder.cc
    LlamaContextAttentionLayer.cc
    LlamaDecoderSelfAttentionLayer.cc
    LlamaDecoder.cc
    LlamaWeight.cc
    LlamaDecoderLayerWeight.cc
    LlamaFfnLayer.cc
    llama_kernels.cu
    llama_decoder_kernels.cu
    llama_utils.cu)

set_target_properties(Llama PROPERTIES
    POSITION_INDEPENDENT_CODE ON
    CUDA_RESOLVE_DEVICE_SYMBOLS ON)

target_link_libraries(Llama PUBLIC
    -lcudart
    cublasMMWrapper
    DynamicDecodeLayer
    activation_kernels
    decoder_masked_multihead_attention
    bert_preprocess_kernels
    decoding_kernels
    unfused_attention_kernels
    custom_ar_kernels
    custom_ar_comm
    gpt_kernels
    tensor
    memory_utils
    nccl_utils
    cuda_utils
    logger
    stdc++fs
    llama_fmha)

# Standalone GEMM tuning/search utility for Llama.
add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC -lcudart gpt_gemm_func memory_utils cuda_utils logger)
Loading