diff --git a/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh b/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh index feea4018e..2b6a9a867 100644 --- a/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh +++ b/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh @@ -15,6 +15,7 @@ #define CUDA_BLOCK_SIZE_4096 4096 #define CUDA_BLOCK_SIZE_1024 1024 #define CUDA_BLOCK_SIZE_512 512 +#define CUDA_BLOCK_SIZE_2048 2048 #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess) diff --git a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu index 6dae5af61..5ef92c49f 100644 --- a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu +++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu @@ -88,6 +88,10 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len, _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len, + _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu index b083650d4..93b5d3a4f 100644 --- a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu +++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -123,6 +123,8 @@ infiniStatus_t Descriptor::calculate( CHECK_STATUS(launchKernel(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CHECK_STATUS(launchKernel(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CHECK_STATUS(launchKernel(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } diff --git a/xmake/iluvatar.lua b/xmake/iluvatar.lua index 35ccf2154..e5eb5c05f 100644 --- a/xmake/iluvatar.lua +++ b/xmake/iluvatar.lua @@ -43,7 +43,7 @@ target("infiniop-iluvatar") set_warnings("all", "error") add_cuflags("-Wno-error=unused-private-field") - add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true}) + add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", "--cuda-gpu-arch=ivcore20", {force = true}) add_culdflags("-fPIC") add_cxflags("-fPIC")