Skip to content

Commit fc5a50d

Browse files
committed
feat: 添加Scale算子
1 parent 120d30f commit fc5a50d

File tree

7 files changed

+382
-0
lines changed

7 files changed

+382
-0
lines changed

operators/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pub mod random_sample;
1818
pub mod rearrange;
1919
pub mod rms_norm;
2020
pub mod rope;
21+
pub mod scale;
2122
pub mod swiglu;
2223

2324
pub use common::*;

operators/src/scale/args.rs

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
use crate::{
2+
get_static, rank_mismatch, shape_mismatch, shape_not_support, utils::type_distinct, ConstPtr,
3+
Hardware, MutPtr, SchemeError, TensorLayout,
4+
};
5+
use digit_layout::DigitLayout;
6+
use itertools::izip;
7+
use std::{
8+
cmp::Ordering,
9+
ptr::{null, null_mut},
10+
};
11+
12+
#[derive(Clone)]
13+
pub struct Args<H: Hardware> {
14+
pub c_layout: TensorLayout,
15+
pub c_base: MutPtr<H>,
16+
pub a_layout: TensorLayout,
17+
pub a_base: ConstPtr<H>,
18+
pub scale: f32,
19+
}
20+
21+
impl<H: Hardware> Args<H> {
22+
pub fn new_null(
23+
c_layout: TensorLayout,
24+
a_layout: TensorLayout,
25+
b_layout: TensorLayout,
26+
) -> Self {
27+
Self {
28+
c_layout,
29+
c_base: null_mut(),
30+
a_layout,
31+
a_base: null(),
32+
scale: 1.0,
33+
}
34+
}
35+
}
36+
37+
#[derive(Clone, Debug)]
38+
pub(super) struct Scheme(DigitLayout, Box<[isize]>);
39+
40+
impl Scheme {
41+
pub fn new<H: Hardware>(args: &Args<H>) -> Result<Self, SchemeError> {
42+
let Args {
43+
c_layout: c,
44+
a_layout: a,
45+
..
46+
} = args;
47+
// # 检查基本属性
48+
let dt = type_distinct(&[c.dt(), a.dt()])?;
49+
let ndim = c.ndim();
50+
if a.ndim() != ndim {
51+
return Err(rank_mismatch(format!(
52+
"c.ndim = {}, a.ndim = {}",
53+
c.ndim(),
54+
a.ndim(),
55+
)));
56+
}
57+
// # 输入形状
58+
#[derive(Clone, PartialEq, Eq, Debug)]
59+
struct Dim {
60+
d: usize,
61+
c: isize,
62+
a: isize,
63+
}
64+
let mut dims = Vec::with_capacity(ndim);
65+
for (&d, &da, &sc, &sa) in izip!(c.shape(), a.shape(), c.strides(), a.strides(),) {
66+
get_static! {
67+
d da
68+
sc sa
69+
}
70+
if da != d {
71+
return Err(shape_mismatch(format!(
72+
"c: {:?}, a: {:?}",
73+
c.shape(),
74+
a.shape(),
75+
)));
76+
}
77+
// 剔除初始的 1 长维度
78+
if d != 1 {
79+
if sc == 0 {
80+
return Err(shape_not_support("Reducing is not allowed for scale"));
81+
}
82+
dims.push(Dim { d, c: sc, a: sa })
83+
}
84+
}
85+
// # 排序
86+
dims.sort_unstable_by(|dim0, dim1| {
87+
let &Dim {
88+
d: d0,
89+
c: c0,
90+
a: a0,
91+
} = dim0;
92+
let &Dim {
93+
d: d1,
94+
c: c1,
95+
a: a1,
96+
} = dim1;
97+
use Ordering::Equal as Eq;
98+
match c0.abs().cmp(&c1.abs()) {
99+
Eq => match a0.abs().cmp(&a1.abs()) {
100+
ord => ord.reverse(),
101+
},
102+
ord => ord.reverse(),
103+
}
104+
});
105+
// # 合并连续维度
106+
let mut ndim = dims.len();
107+
for i in (1..dims.len()).rev() {
108+
let (head, tail) = dims.split_at_mut(i);
109+
let f = &mut head[i - 1]; // f for front
110+
let b = &mut tail[0]; // b for back
111+
let d = b.d as isize;
112+
if b.c * d == f.c && b.a * d == f.a {
113+
*f = Dim { d: b.d * f.d, ..*b };
114+
*b = Dim { d: 1, c: 0, a: 0 };
115+
ndim -= 1
116+
}
117+
}
118+
// # 合并空间
119+
let mut layout = vec![0isize; 1 + ndim * 4].into_boxed_slice();
120+
{
121+
let (idx, tail) = layout.split_at_mut(1 + ndim);
122+
let (c_, tail) = tail.split_at_mut(ndim);
123+
let (a_, b_) = tail.split_at_mut(ndim);
124+
for (Dim { d, c, a }, idx, c_, a_) in
125+
izip!(dims.into_iter().filter(|d| d.d != 1), &mut *idx, c_, a_)
126+
{
127+
*idx = d as _;
128+
*c_ = c;
129+
*a_ = a;
130+
}
131+
idx[ndim] = 1;
132+
for i in (1..=ndim).rev() {
133+
idx[i - 1] *= idx[i];
134+
}
135+
}
136+
Ok(Self(dt, layout))
137+
}
138+
139+
#[inline]
140+
pub const fn dt(&self) -> DigitLayout {
141+
self.0
142+
}
143+
144+
/// 执行方案维数。
145+
#[inline]
146+
pub fn ndim(&self) -> usize {
147+
(self.1.len() - 1) / 4
148+
}
149+
150+
/// 读写单元数量。
151+
#[inline]
152+
pub fn count(&self) -> usize {
153+
self.1[0] as _
154+
}
155+
156+
/// 索引步长。
157+
#[inline]
158+
pub fn idx_strides(&self) -> &[isize] {
159+
let ndim = self.ndim();
160+
&self.1[1..][..ndim]
161+
}
162+
163+
#[inline]
164+
pub fn c_strides(&self) -> &[isize] {
165+
let ndim = self.ndim();
166+
&self.1[1 + ndim..][..ndim]
167+
}
168+
169+
#[inline]
170+
pub fn a_strides(&self) -> &[isize] {
171+
let ndim = self.ndim();
172+
&self.1[1 + ndim * 2..][..ndim]
173+
}
174+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
use super::{args::Scheme, Args, Scale};
2+
use crate::{common_cpu::Cpu, ByteOf, LaunchError, QueueAlloc, SchemeError};
3+
use digit_layout::types as ty;
4+
use half::f16;
5+
use rayon::iter::{IntoParallelIterator, ParallelIterator};
6+
7+
pub struct Operator;
8+
9+
impl Scale<Cpu> for Operator {}
10+
11+
impl crate::Operator for Operator {
12+
type Hardware = Cpu;
13+
type TopoNode = Cpu;
14+
type Args = Args<Cpu>;
15+
16+
#[inline]
17+
fn new(_node: &Self::TopoNode) -> Self {
18+
Self
19+
}
20+
#[inline]
21+
fn scheme(
22+
&mut self,
23+
_args: &Self::Args,
24+
_max_workspace_size: usize,
25+
) -> Result<usize, SchemeError> {
26+
Ok(0)
27+
}
28+
29+
fn launch<QA>(
30+
&self,
31+
args: &Self::Args,
32+
_workspace: &mut [ByteOf<Self::Hardware>],
33+
_queue_alloc: &QA,
34+
) -> Result<(), LaunchError>
35+
where
36+
QA: QueueAlloc<Hardware = Self::Hardware>,
37+
{
38+
let scheme = Scheme::new(args)?;
39+
let c = args.c_base as isize;
40+
let a = args.a_base as isize;
41+
let s = args.scale;
42+
let idx_strides = scheme.idx_strides();
43+
let c_strides = scheme.c_strides();
44+
let a_strides = scheme.a_strides();
45+
(0..scheme.count() as isize)
46+
.into_par_iter()
47+
.for_each(|mut rem| {
48+
let mut c = c;
49+
let mut a = a;
50+
for (i, &s) in idx_strides.iter().enumerate() {
51+
let k = rem / s;
52+
c += k * c_strides[i];
53+
a += k * a_strides[i];
54+
rem %= s;
55+
}
56+
match scheme.dt() {
57+
ty::F16 => mul::<f16>(c, a, f16::from_f32(s)),
58+
ty::F32 => mul::<f32>(c, a, s),
59+
ty::F64 => mul::<f64>(c, a, s as f64),
60+
_ => todo!(),
61+
}
62+
});
63+
Ok(())
64+
}
65+
}
66+
67+
/// Writes `*a * s` into `*c`, where `c` and `a` are element addresses
/// smuggled through `isize` (so they can be moved into a rayon closure).
///
/// Soundness relies on the caller passing valid, properly aligned `T`
/// addresses, with `c` writable and not written concurrently by another
/// task for the same element.
fn mul<T: std::ops::Mul<Output = T>>(c: isize, a: isize, s: T) {
    // SAFETY: the caller guarantees both addresses point at live,
    // aligned `T` values (see the doc comment above).
    unsafe { (c as *mut T).write((a as *const T).read() * s) }
}

operators/src/scale/cuda/mod.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
use super::{args::Scheme, Args, Scale};
2+
use crate::{
3+
cuda::{dt_name, Gpu, Handle, ModuleBox},
4+
shape_not_support, strides_not_support,
5+
utils::{gcd, type_distinct},
6+
ByteOf, LaunchError, QueueAlloc, SchemeDiversity, SchemeError,
7+
};
8+
use digit_layout::DigitLayout;
9+
use lru::LruCache;
10+
use std::{
11+
ffi::{c_uint, CString},
12+
sync::{Arc, Mutex},
13+
};
14+
15+
pub struct Operator {}
16+
impl Scale<Gpu> for Operator {}
17+
18+
impl crate::Operator for Operator {
19+
type Hardware = Gpu;
20+
type TopoNode = Gpu;
21+
type Args = Args<Gpu>;
22+
23+
fn new(node: &Self::TopoNode) -> Self {
24+
Self {}
25+
}
26+
27+
#[inline]
28+
fn scheme(
29+
&mut self,
30+
args: &Self::Args,
31+
_max_workspace_size: usize,
32+
) -> Result<usize, SchemeError> {
33+
todo!();
34+
Ok(0)
35+
}
36+
37+
fn launch<QA>(
38+
&self,
39+
_args: &Self::Args,
40+
_workspace: &mut [ByteOf<Self::Hardware>],
41+
_queue_alloc: &QA,
42+
) -> Result<(), LaunchError>
43+
where
44+
QA: QueueAlloc<Hardware = Self::Hardware>,
45+
{
46+
todo!();
47+
Ok(())
48+
}
49+
}

operators/src/scale/infini/mod.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
use super::{Args, Scale};
2+
use crate::{infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError};
3+
4+
pub struct Operator;
5+
6+
impl Add<Device> for Operator {}
7+
8+
impl crate::Operator for Operator {
9+
type Hardware = Device;
10+
type TopoNode = Device;
11+
type Args = Args<Device>;
12+
13+
fn new(_node: &Self::TopoNode) -> Self {
14+
todo!()
15+
}
16+
17+
fn scheme(
18+
&mut self,
19+
_args: &Self::Args,
20+
_max_workspace_size: usize,
21+
) -> Result<usize, SchemeError> {
22+
todo!()
23+
}
24+
25+
fn launch<QA>(
26+
&self,
27+
_args: &Self::Args,
28+
_workspace: &mut [ByteOf<Self::Hardware>],
29+
_queue_alloc: &QA,
30+
) -> Result<(), LaunchError>
31+
where
32+
QA: QueueAlloc<Hardware = Self::Hardware>,
33+
{
34+
todo!()
35+
}
36+
}

operators/src/scale/mod.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
//! c =scale*a
2+
3+
#[cfg(any(use_cpu, test))]
4+
pub mod common_cpu;
5+
#[cfg(use_cuda)]
6+
pub mod cuda;
7+
#[cfg(use_infini)]
8+
pub mod infini;
9+
#[cfg(use_cl)]
10+
pub mod opencl;
11+
12+
mod args;
13+
pub use args::Args;
14+
15+
crate::op_trait!(Scale);

operators/src/scale/opencl/mod.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
use super::{Args, Scale};
2+
use crate::{opencl::ClDevice, ByteOf, LaunchError, QueueAlloc, SchemeError};
3+
4+
pub struct Operator;
5+
6+
impl Scale<ClDevice> for Operator {}
7+
8+
impl crate::Operator for Operator {
9+
type Hardware = ClDevice;
10+
type TopoNode = ClDevice;
11+
type Args = Args<ClDevice>;
12+
13+
fn new(_node: &Self::TopoNode) -> Self {
14+
todo!()
15+
}
16+
17+
fn scheme(
18+
&mut self,
19+
_args: &Self::Args,
20+
_max_workspace_size: usize,
21+
) -> Result<usize, SchemeError> {
22+
todo!()
23+
}
24+
25+
fn launch<QA>(
26+
&self,
27+
_args: &Self::Args,
28+
_workspace: &mut [ByteOf<Self::Hardware>],
29+
_queue_alloc: &QA,
30+
) -> Result<(), LaunchError>
31+
where
32+
QA: QueueAlloc<Hardware = Self::Hardware>,
33+
{
34+
todo!()
35+
}
36+
}

0 commit comments

Comments
 (0)