Issue/259 softmax_cpu计算抽象减少冗余

Graylatzhou · Graylatzhou · commit e843a513b774 · 2025-07-09T14:16:34.000+08:00
diff --git a/include/infiniop/ops/softmax.h b/include/infiniop/ops/softmax.h
@@ -1,5 +1,5 @@
-#ifndef __INFINIOP_MLP_API_H__
-#define __INFINIOP_MLP_API_H__
+#ifndef __INFINIOP_SOFTMAX_API_H__
+#define __INFINIOP_SOFTMAX_API_H__
 
 #include "../operator_descriptor.h"
 
diff --git a/src/infiniop/ops/softmax/cpu/softmax_cpu.cc b/src/infiniop/ops/softmax/cpu/softmax_cpu.cc
@@ -41,47 +41,46 @@ void softmax_cpu(const SoftmaxInfo &info,
     int dimsize = info.dimsize;
     int stride = info.stride;
     int othersize = info.otherdim_size;
-    if constexpr (std::is_same_v<T, fp16_t>) {
-        auto input = reinterpret_cast<const fp16_t *>(x);
-        auto output = reinterpret_cast<fp16_t *>(y);
-        for (int i = 0; i < othersize; i++) {
-            int tid = i % stride + (i - i % stride) * dimsize;
-            float max_data = -INFINITY;
-            for (int j = 0; j < dimsize; j++) {
-                int index = tid + j * stride;
-                max_data = fmax(max_data, utils::cast<float>(input[index]));
-            }
-            float sum_data = 0.0f;
-            for (int j = 0; j < dimsize; j++) {
-                int index = tid + j * stride;
-                sum_data += std::exp(utils::cast<float>(input[index]) - max_data);
-            }
-            for (int j = 0; j < dimsize; j++) {
-                int index = tid + j * stride;
-                output[index] = utils::cast<fp16_t>(std::exp(utils::cast<float>(input[index]) - max_data) / sum_data);
-            }
+    auto to_float = [](const T &val) -> float {
+        if constexpr (std::is_same_v<T, fp16_t>) {
+            return utils::cast<float>(val);
+        } else {
+            return val;
         }
-    } else if constexpr (std::is_same_v<T, float>) {
-        auto input = reinterpret_cast<const float *>(x);
-        auto output = reinterpret_cast<float *>(y);
-#pragma omp parallel for
-        for (int i = 0; i < othersize; i++) {
-            int tid = i % stride + (i - i % stride) * dimsize;
-            float max_data = -INFINITY;
-            for (int j = 0; j < dimsize; j++) {
-                int index = tid + j * stride;
-                max_data = fmax(max_data, input[index]);
-            }
-            float sum_data = 0.0f;
-            for (int j = 0; j < dimsize; j++) {
-                int index = tid + j * stride;
-                sum_data += std::exp(input[index] - max_data);
-            }
-            for (int j = 0; j < dimsize; j++) {
-                int index = tid + j * stride;
-                output[index] = std::exp(input[index] - max_data) / sum_data;
-            }
+    };
+
+    auto from_float = [](float val) -> T {
+        if constexpr (std::is_same_v<T, fp16_t>) {
+            return utils::cast<fp16_t>(val);
+        } else {
+            return val;
+        }
+    };
+
+    auto input = reinterpret_cast<const T *>(x);
+    auto output = reinterpret_cast<T *>(y);
+
+    auto compute_softmax = [&](int i) {
+        int tid = i % stride + (i - i % stride) * dimsize;
+        float max_data = -INFINITY;
+        for (int j = 0; j < dimsize; j++) {
+            int index = tid + j * stride;
+            max_data = fmax(max_data, to_float(input[index]));
+        }
+        float sum_data = 0.0f;
+        for (int j = 0; j < dimsize; j++) {
+            int index = tid + j * stride;
+            sum_data += std::exp(to_float(input[index]) - max_data);
         }
+        for (int j = 0; j < dimsize; j++) {
+            int index = tid + j * stride;
+            float result = std::exp(to_float(input[index]) - max_data) / sum_data;
+            output[index] = from_float(result);
+        }
+    };
+#pragma omp parallel for
+    for (int i = 0; i < othersize; i++) {
+        compute_softmax(i);
     }
 }
 
diff --git a/src/infiniop/ops/softmax/cuda/softmax_kernel.cuh b/src/infiniop/ops/softmax/cuda/softmax_kernel.cuh
@@ -1,5 +1,6 @@
 #ifndef __SOFTMAX_CUDA_KERNEL_H__
 #define __SOFTMAX_CUDA_KERNEL_H__
+
 #include "../../../devices/cuda/cuda_kernel_common.cuh"
 #include "softmax_cuda.cuh"
 #include <cub/block/block_reduce.cuh>
@@ -74,7 +75,6 @@ i 也就是 (blockIdx.x * blockDim.y + threadIdx.y) / stride
 j 也就是 (blockIdx.x * blockDim.y + threadIdx.y) % stride
 然后i转化为线性也就是 i * stride * dimsize
 j直接加上就好
-
 */
 template <int elemPerThread, int BLOCK_DIM_Y, int BLOCK_DIM_X, typename T>
 __global__ void Softmax_warp_impl(const T *x, T *y, int stride, int dimsize, int otherdim_size) {
@@ -236,4 +236,4 @@ infiniStatus_t softmax_dispatch(const op::softmax::SoftmaxInfo &info, void *y, c
     return INFINI_STATUS_SUCCESS;
 }
 
-#endif // __SOFTMAX_CUDA_KERNEL_H__
+#endif // __SOFTMAX_CUDA_KERNEL_H__
diff --git a/src/infiniop/ops/softmax/info.h b/src/infiniop/ops/softmax/info.h
@@ -1,5 +1,5 @@
-#ifndef __CONV_INFO_H__
-#define __CONV_INFO_H__
+#ifndef __SOFTMAX_INFO_H__
+#define __SOFTMAX_INFO_H__
 
 #include "../../../utils.h"
 #include "../../operator.h"
@@ -44,4 +44,4 @@ class SoftmaxInfo {
 };
 } // namespace op::softmax
 
-#endif // __CONV_INFO_H__
+#endif // __SOFTMAX_INFO_H__
diff --git a/src/infiniop/ops/softmax/operator.cc b/src/infiniop/ops/softmax/operator.cc
@@ -107,4 +107,4 @@ infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t desc) {
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
 #undef DELETE
-}
+}
diff --git a/src/infiniop/ops/softmax/softmax.h b/src/infiniop/ops/softmax/softmax.h
@@ -46,4 +46,4 @@
             void *stream) const;                                 \
     };                                                           \
     }
-#endif // __CONV_H__
+#endif // __SOFTMAX_H__

Original file line number	Diff line number	Diff line change
`@@ -107,4 +107,4 @@ infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t desc) {`
`107`	`107`	`return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;`
`108`	`108`	`}`
`109`	`109`	`#undef DELETE`
`110`		`-}`
	`110`	`+}`
Original file line number	Diff line number	Diff line change
`@@ -46,4 +46,4 @@`
`46`	`46`	`void *stream) const; \`
`47`	`47`	`}; \`
`48`	`48`	`}`
`49`		`-#endif // __CONV_H__`
	`49`	`+#endif // __SOFTMAX_H__`