From c634388f461fbde469fcd0b5b3144f5cc23e0e32 Mon Sep 17 00:00:00 2001
From: YU Qing <2961548487@qq.com>
Date: Thu, 11 Dec 2025 11:06:55 +0800
Subject: [PATCH] Add files via upload

---
 __init__.py          | 10 ++++++
 softmin.cpp          |  6 ++++
 softmin_kernel.cu    | 57 ++++++++++++++++++++++++++++++++
 softshrink.cpp       |  6 ++++
 softshrink_kernel.cu | 32 ++++++++++++++++
 split.cpp            |  7 ++++
 split_kernel.cu      | 44 +++++++++++++++++++++++
 std.cpp              |  7 ++++
 std_kernel.cu        | 57 ++++++++++++++++++++++++++++++++
 std_mean.cpp         |  6 ++++
 std_mean_kernel.cu   | 64 ++++++++++++++++++++++++++++++++++++
 11 files changed, 296 insertions(+)
 create mode 100644 __init__.py
 create mode 100644 softmin.cpp
 create mode 100644 softmin_kernel.cu
 create mode 100644 softshrink.cpp
 create mode 100644 softshrink_kernel.cu
 create mode 100644 split.cpp
 create mode 100644 split_kernel.cu
 create mode 100644 std.cpp
 create mode 100644 std_kernel.cu
 create mode 100644 std_mean.cpp
 create mode 100644 std_mean_kernel.cu

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 000000000..be6ad7d88
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,10 @@
+# python/infinicore/ops/softshrink/__init__.py
+import torch
+from .._C import softshrink as _softshrink
+
+# "lambda" is a reserved word in Python, so the argument is named "lambd",
+# matching torch.nn.functional.softshrink.
+def softshrink(input, lambd=0.5):
+    if input.is_cuda and input.dtype == torch.float16:
+        return _softshrink(input, lambd)
+    return torch.nn.functional.softshrink(input.float(), lambd).to(input.dtype)
\ No newline at end of file
diff --git a/softmin.cpp b/softmin.cpp
new file mode 100644
index 000000000..85a9bd8c4
--- /dev/null
+++ b/softmin.cpp
@@ -0,0 +1,6 @@
+// python/infinicore/ops/softmin/softmin.cpp
+#include <torch/extension.h>
+torch::Tensor softmin(const torch::Tensor& input);
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("softmin", &softmin);
+}
\ No newline at end of file
diff --git a/softmin_kernel.cu b/softmin_kernel.cu
new file mode 100644
index 000000000..73b40485f
--- /dev/null
+++ b/softmin_kernel.cu
@@ -0,0 +1,57 @@
+// python/infinicore/ops/softmin/softmin_kernel.cu
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+
+// Softmin is softmax of the negated input: exp(min - x) / sum(exp(min - x)),
+// shifted by the row minimum for numerical stability. One 256-thread block
+// reduces one row.
+__global__ void softmin_128x_kernel(const half* x, half* y, int64_t rows, int64_t cols) {
+    extern __shared__ half sdata[];
+    int tid = threadIdx.x;
+    int row = blockIdx.x;
+    if (row >= rows) return;
+    x += row * cols;
+    y += row * cols;
+
+    // First pass: row minimum.
+    half thread_min = __float2half(65504.0f); // largest finite half
+    for (int i = tid; i < cols; i += 256) {
+        if (__hlt(x[i], thread_min)) thread_min = x[i];
+    }
+    sdata[tid] = thread_min;
+    __syncthreads();
+    for (int s = 128; s > 0; s >>= 1) {
+        if (tid < s && __hlt(sdata[tid + s], sdata[tid])) sdata[tid] = sdata[tid + s];
+        __syncthreads();
+    }
+    half row_min = sdata[0];
+    __syncthreads(); // sdata is reused below
+
+    // Second pass: sum of exp(min - x).
+    half thread_sum = __float2half(0.0f);
+    for (int i = tid; i < cols; i += 256) {
+        thread_sum = __hadd(thread_sum, hexp(__hsub(row_min, x[i])));
+    }
+    sdata[tid] = thread_sum;
+    __syncthreads();
+    for (int s = 128; s > 0; s >>= 1) {
+        if (tid < s) sdata[tid] = __hadd(sdata[tid], sdata[tid + s]);
+        __syncthreads();
+    }
+    half row_sum = sdata[0];
+
+    for (int i = tid; i < cols; i += 256) {
+        y[i] = __hdiv(hexp(__hsub(row_min, x[i])), row_sum);
+    }
+}
+
+torch::Tensor softmin(const torch::Tensor& input) {
+    TORCH_CHECK(input.scalar_type() == torch::kFloat16);
+    TORCH_CHECK(input.dim() == 2, "softmin kernel expects a 2D tensor");
+    auto out = torch::empty_like(input);
+    softmin_128x_kernel<<<input.size(0), 256, 256 * sizeof(half)>>>(
+        reinterpret_cast<const half*>(input.data_ptr<at::Half>()),
+        reinterpret_cast<half*>(out.data_ptr<at::Half>()),
+        input.size(0), input.size(1));
+    return out;
+}
\ No newline at end of file
diff --git a/softshrink.cpp b/softshrink.cpp
new file mode 100644
index 000000000..a80a76bef
--- /dev/null
+++ b/softshrink.cpp
@@ -0,0 +1,6 @@
+// python/infinicore/ops/softshrink/softshrink.cpp
+#include <torch/extension.h>
+torch::Tensor softshrink(const torch::Tensor& input, float lambd);
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("softshrink", &softshrink, "softshrink", py::arg("input"), py::arg("lambd") = 0.5f);
+}
\ No newline at end of file
diff --git a/softshrink_kernel.cu b/softshrink_kernel.cu
new file mode 100644
index 000000000..09cf5083a
--- /dev/null
+++ b/softshrink_kernel.cu
@@ -0,0 +1,32 @@
+// python/infinicore/ops/softshrink/softshrink_kernel.cu
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+
+__global__ void softshrink_152x_kernel(const half* x, half* y, half lambd, int64_t n) {
+    int64_t idx = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    half val = x[idx];
+    half zero = __float2half(0.0f);
+    half pos = __hsub(val, lambd); // x - lambda
+    half neg = __hadd(val, lambd); // x + lambda
+
+    y[idx] = __hgt(val, lambd) ? pos : (__hlt(val, __hneg(lambd)) ? neg : zero);
+}
+
+torch::Tensor softshrink(const torch::Tensor& input, float lambd) {
+    TORCH_CHECK(input.scalar_type() == torch::kFloat16);
+    auto out = torch::empty_like(input);
+    half h_lambd = __float2half(lambd);
+
+    int threads = 512;
+    int blocks = (int)((input.numel() + threads - 1) / threads);
+
+    softshrink_152x_kernel<<<blocks, threads>>>(
+        reinterpret_cast<const half*>(input.data_ptr<at::Half>()),
+        reinterpret_cast<half*>(out.data_ptr<at::Half>()),
+        h_lambd,
+        input.numel()
+    );
+    return out;
+}
\ No newline at end of file
diff --git a/split.cpp b/split.cpp
new file mode 100644
index 000000000..ff865ebb8
--- /dev/null
+++ b/split.cpp
@@ -0,0 +1,7 @@
+// python/infinicore/ops/split/split.cpp
+#include <torch/extension.h>
+torch::Tensor split_cuda(const torch::Tensor& input, int64_t split_size, int64_t dim);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("split", &split_cuda, "split", py::arg("input"), py::arg("split_size"), py::arg("dim") = 0);
+}
\ No newline at end of file
diff --git a/split_kernel.cu b/split_kernel.cu
new file mode 100644
index 000000000..998182e93
--- /dev/null
+++ b/split_kernel.cu
@@ -0,0 +1,44 @@
+// python/infinicore/ops/split/split_kernel.cu
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+
+// Equal-size chunks along dim 0 keep the contiguous layout of the input,
+// so the copy is element-for-element; a short final chunk is zero-padded.
+extern "C" __global__ void split_168x_kernel(
+    const half* __restrict__ input,
+    half* __restrict__ output,
+    int64_t input_elements,
+    int64_t output_elements
+) {
+    int64_t idx = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= output_elements) return;
+
+    output[idx] = idx < input_elements ? input[idx] : __float2half(0.0f);
+}
+
+// Unlike torch.split, this returns one stacked {num_splits, split_size, inner}
+// tensor rather than a tuple of views.
+torch::Tensor split_cuda(const torch::Tensor& input, int64_t split_size, int64_t dim) {
+    TORCH_CHECK(dim == 0, "split only supports dim=0 for max speed");
+    TORCH_CHECK(input.scalar_type() == torch::kFloat16);
+    TORCH_CHECK(input.is_contiguous());
+
+    int64_t outer = input.size(0);
+    int64_t inner = input.numel() / outer;
+    int64_t num_splits = (outer + split_size - 1) / split_size;
+
+    auto output = torch::empty({num_splits, split_size, inner}, input.options());
+
+    int64_t total_elements = num_splits * split_size * inner;
+    int threads = 512;
+    int blocks = (int)((total_elements + threads - 1) / threads);
+
+    split_168x_kernel<<<blocks, threads>>>(
+        reinterpret_cast<const half*>(input.data_ptr<at::Half>()),
+        reinterpret_cast<half*>(output.data_ptr<at::Half>()),
+        input.numel(),
+        total_elements
+    );
+
+    return output;
+}
\ No newline at end of file
diff --git a/std.cpp b/std.cpp
new file mode 100644
index 000000000..6d287b794
--- /dev/null
+++ b/std.cpp
@@ -0,0 +1,7 @@
+// python/infinicore/ops/std/std.cpp
+#include <torch/extension.h>
+// Named std_cuda in C++: a global function named "std" would collide with namespace std.
+torch::Tensor std_cuda(const torch::Tensor& input, int64_t dim, bool unbiased, bool keepdim);
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("std", &std_cuda, "std", py::arg("input"), py::arg("dim") = -1, py::arg("unbiased") = true, py::arg("keepdim") = false);
+}
\ No newline at end of file
diff --git a/std_kernel.cu b/std_kernel.cu
new file mode 100644
index 000000000..c916d33a9
--- /dev/null
+++ b/std_kernel.cu
@@ -0,0 +1,57 @@
+// python/infinicore/ops/std/std_kernel.cu
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+
+__global__ void std_142x_kernel(const half* __restrict__ x, half* __restrict__ out,
+                                int64_t rows, int64_t cols, bool unbiased) {
+    extern __shared__ half sdata[];
+    int tid = threadIdx.x;
+    int row = blockIdx.x;
+    if (row >= rows) return;
+
+    // First pass: compute the row mean.
+    half sum = __float2half(0.0f);
+    for (int i = tid; i < cols; i += 256) {
+        sum = __hadd(sum, x[row * cols + i]);
+    }
+    sdata[tid] = sum;
+    __syncthreads();
+    for (int s = 128; s > 0; s >>= 1) {
+        if (tid < s) sdata[tid] = __hadd(sdata[tid], sdata[tid + s]);
+        __syncthreads();
+    }
+    half mean = __hdiv(sdata[0], __int2half_rn((int)cols));
+    __syncthreads(); // sdata is reused below
+
+    // Second pass: compute the variance.
+    half var = __float2half(0.0f);
+    for (int i = tid; i < cols; i += 256) {
+        half diff = __hsub(x[row * cols + i], mean);
+        var = __hadd(var, __hmul(diff, diff));
+    }
+    sdata[tid] = var;
+    __syncthreads();
+    for (int s = 128; s > 0; s >>= 1) {
+        if (tid < s) sdata[tid] = __hadd(sdata[tid], sdata[tid + s]);
+        __syncthreads();
+    }
+    int denom = unbiased ? (int)cols - 1 : (int)cols; // Bessel's correction
+    half std_val = hsqrt(__hdiv(sdata[0], __int2half_rn(denom)));
+
+    if (tid == 0) out[row] = std_val;
+}
+
+torch::Tensor std_cuda(const torch::Tensor& input, int64_t dim, bool unbiased, bool keepdim) {
+    TORCH_CHECK(dim == 1 || dim == -1, "only dim=1 supported for max speed");
+    TORCH_CHECK(input.scalar_type() == torch::kFloat16);
+    auto out = torch::empty({input.size(0)}, input.options());
+
+    std_142x_kernel<<<input.size(0), 256, 256 * sizeof(half)>>>(
+        reinterpret_cast<const half*>(input.data_ptr<at::Half>()),
+        reinterpret_cast<half*>(out.data_ptr<at::Half>()),
+        input.size(0),
+        input.size(1),
+        unbiased
+    );
+    return keepdim ? out.unsqueeze(1) : out;
+}
\ No newline at end of file
diff --git a/std_mean.cpp b/std_mean.cpp
new file mode 100644
index 000000000..1cea89ecc
--- /dev/null
+++ b/std_mean.cpp
@@ -0,0 +1,6 @@
+// python/infinicore/ops/std_mean/std_mean.cpp
+#include <torch/extension.h>
+std::tuple<torch::Tensor, torch::Tensor> std_mean(const torch::Tensor& input);
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("std_mean", &std_mean);
+}
\ No newline at end of file
diff --git a/std_mean_kernel.cu b/std_mean_kernel.cu
new file mode 100644
index 000000000..3e476db1b
--- /dev/null
+++ b/std_mean_kernel.cu
@@ -0,0 +1,64 @@
+// python/infinicore/ops/std_mean/std_mean_kernel.cu
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+
+__global__ void std_mean_158x_kernel(
+    const half* __restrict__ x,
+    half* __restrict__ mean_out,
+    half* __restrict__ std_out,
+    int64_t rows,
+    int64_t cols
+) {
+    // Named sdata so the reduction counter s below cannot shadow it.
+    extern __shared__ half sdata[];
+    int tid = threadIdx.x;
+    int row = blockIdx.x;
+    if (row >= rows) return;
+
+    // First pass: compute the row sum (for the mean).
+    half sum = __float2half(0.0f);
+    for (int i = tid; i < cols; i += 256) {
+        sum = __hadd(sum, x[row * cols + i]);
+    }
+    sdata[tid] = sum;
+    __syncthreads();
+    for (int s = 128; s > 0; s >>= 1) {
+        if (tid < s) sdata[tid] = __hadd(sdata[tid], sdata[tid + s]);
+        __syncthreads();
+    }
+    half mean = __hdiv(sdata[0], __int2half_rn((int)cols));
+    if (tid == 0) mean_out[row] = mean;
+    __syncthreads(); // sdata is reused below
+
+    // Second pass: compute the (biased) variance, reusing the mean.
+    half var = __float2half(0.0f);
+    for (int i = tid; i < cols; i += 256) {
+        half diff = __hsub(x[row * cols + i], mean);
+        var = __hadd(var, __hmul(diff, diff));
+    }
+    sdata[tid] = var;
+    __syncthreads();
+    for (int s = 128; s > 0; s >>= 1) {
+        if (tid < s) sdata[tid] = __hadd(sdata[tid], sdata[tid + s]);
+        __syncthreads();
+    }
+    half std_val = hsqrt(__hdiv(sdata[0], __int2half_rn((int)cols)));
+
+    if (tid == 0) std_out[row] = std_val;
+}
+
+std::tuple<torch::Tensor, torch::Tensor> std_mean(const torch::Tensor& input) {
+    TORCH_CHECK(input.scalar_type() == torch::kFloat16);
+    auto mean = torch::empty({input.size(0)}, input.options());
+    auto stdv = torch::empty({input.size(0)}, input.options());
+
+    std_mean_158x_kernel<<<input.size(0), 256, 256 * sizeof(half)>>>(
+        reinterpret_cast<const half*>(input.data_ptr<at::Half>()),
+        reinterpret_cast<half*>(mean.data_ptr<at::Half>()),
+        reinterpret_cast<half*>(stdv.data_ptr<at::Half>()),
+        input.size(0),
+        input.size(1)
+    );
+
+    return {mean, stdv};
+}
\ No newline at end of file
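
A quick smoke test for the softshrink binding, as a minimal sketch: it assumes
the two softshrink sources sit in the working directory, a CUDA device is
available, and the extension is built ad hoc with torch.utils.cpp_extension.load
(the extension name below is arbitrary).

    import torch
    from torch.utils.cpp_extension import load

    # JIT-build the extension from the two files added in this patch.
    ext = load(name="softshrink_ext",
               sources=["softshrink.cpp", "softshrink_kernel.cu"])

    x = torch.randn(4096, device="cuda", dtype=torch.float16)
    y = ext.softshrink(x, lambd=0.5)
    # Reference in fp32; the kernel works entirely in fp16, so compare
    # with a loose tolerance.
    ref = torch.nn.functional.softshrink(x.float(), lambd=0.5).half()
    assert torch.allclose(y, ref, atol=1e-3)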
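
The same pattern can sanity-check the softmin kernel against
torch.nn.functional.softmin, since softmin(x) = softmax(-x) is exactly what the
min-shifted exp(min - x) form computes. Again a sketch under the same
assumptions, with an arbitrary extension name:

    import torch
    from torch.utils.cpp_extension import load

    ext = load(name="softmin_ext",
               sources=["softmin.cpp", "softmin_kernel.cu"])

    x = torch.randn(32, 256, device="cuda", dtype=torch.float16)
    y = ext.softmin(x)
    ref = torch.nn.functional.softmin(x.float(), dim=1).half()
    # fp16 accumulation over each row limits the achievable accuracy.
    assert torch.allclose(y, ref, atol=1e-2)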