diff --git a/.github/workflows/runtime-build.yml b/.github/workflows/runtime-build.yml index 2a79eaef9e..5b4c7faefd 100644 --- a/.github/workflows/runtime-build.yml +++ b/.github/workflows/runtime-build.yml @@ -1,6 +1,12 @@ name: runtime-build -on: [ push, pull_request ] +on: + push: + paths: + - 'ntt/**' + pull_request: + paths: + - 'ntt/**' concurrency: group: runtime-build-${{ github.ref }} diff --git a/conanfile.py b/conanfile.py index b203641206..0aa78bd093 100644 --- a/conanfile.py +++ b/conanfile.py @@ -73,9 +73,16 @@ def configure(self): if not self.options.runtime: if self.settings.os == 'Windows' and self.settings.build_type == 'Debug': self.options["nethost"].shared = True + else: + # For Linux and other platforms, use static linking to avoid auditwheel issues + self.options["nethost"].shared = False + + # Configure fmt to be static for Linux builds to avoid auditwheel issues + if self.settings.os == 'Linux': + self.options["fmt"].shared = False if self.options.tests: - self.options["ortki"].shared = True + self.options["ortki"].shared = False self.options["date"].header_only = True def validate(self): diff --git a/ntt/CMakeLists.txt b/ntt/CMakeLists.txt index 1de56acea3..4854869b80 100644 --- a/ntt/CMakeLists.txt +++ b/ntt/CMakeLists.txt @@ -7,5 +7,5 @@ if(BUILD_TESTING) endif() if(BUILD_BENCHMARK) - add_subdirectory(test/benchmark_test) + # add_subdirectory(test/benchmark_test) endif() diff --git a/ntt/include/nncase/bfloat16.h b/ntt/include/nncase/bfloat16.h index 88c3750603..8bb9f63c5d 100644 --- a/ntt/include/nncase/bfloat16.h +++ b/ntt/include/nncase/bfloat16.h @@ -43,6 +43,12 @@ struct bfloat16 { constexpr operator __bf16() const noexcept { return std::bit_cast<__bf16>(value_); } +// #else +// constexpr operator float() const noexcept { +// uint32_t value = raw() << 16; +// return std::bit_cast<float>(value); +// } + #endif constexpr bfloat16() noexcept = default; @@ -53,25 +59,6 @@ struct bfloat16 { constexpr explicit bfloat16(const T &v) noexcept : value_(round_to_bfloat16(v).value_) {} - constexpr bfloat16(from_raw_t, uint16_t value) noexcept : value_(value) {} - - constexpr operator float() const noexcept { - uint32_t value = raw() << 16; - return std::bit_cast<float>(value); - } - - constexpr uint16_t raw() const noexcept { return value_; } - - static constexpr bfloat16 from_raw(uint16_t v) noexcept { - return bfloat16(nncase::from_raw, v); - } - - static constexpr bfloat16 truncate_to_bfloat16(float v) noexcept { - return !std::isnan(v) ? from_raw(static_cast<uint16_t>( - std::bit_cast<uint32_t>(v) >> 16)) - : nan(); - } - // Converts a floating point number to bfloat16, with round-nearest-to-even as // rounding method.
static constexpr bfloat16 round_to_bfloat16(float v) { @@ -93,6 +80,90 @@ struct bfloat16 { } } + // Integer conversion constructors + constexpr explicit bfloat16(int x) noexcept + : value_(round_to_bfloat16(float(x)).value_) {} + + constexpr explicit bfloat16(int64_t x) noexcept + : value_(round_to_bfloat16(float(x)).value_) {} + + constexpr explicit bfloat16(uint32_t x) noexcept + : value_(round_to_bfloat16(float(x)).value_) {} + + constexpr explicit bfloat16(uint64_t x) noexcept + : value_(round_to_bfloat16(double(x)).value_) {} + + // Floating point conversion constructors + constexpr explicit bfloat16(float x) noexcept + : value_(round_to_bfloat16(x).value_) {} + + constexpr explicit bfloat16(double x) noexcept + : value_(round_to_bfloat16(float(x)).value_) {} + + constexpr bfloat16(from_raw_t, uint16_t value) noexcept : value_(value) {} + + constexpr operator float() const noexcept { + uint32_t value = raw() << 16; + return std::bit_cast<float>(value); + } + + constexpr uint16_t raw() const noexcept { return value_; } + + static constexpr bfloat16 from_raw(uint16_t v) noexcept { + return bfloat16(nncase::from_raw, v); + } + + // Type conversion operators + constexpr explicit operator double() const noexcept { + return double(float(*this)); + } + + constexpr explicit operator int() const noexcept { + return int(float(*this)); + } + + constexpr explicit operator int64_t() const noexcept { + return int64_t(float(*this)); + } + + constexpr explicit operator uint32_t() const noexcept { + return uint32_t(float(*this)); + } + + constexpr explicit operator uint64_t() const noexcept { + return uint64_t(double(*this)); + } + + constexpr explicit operator uint8_t() const noexcept { + return uint8_t(float(*this)); + } + + constexpr explicit operator int8_t() const noexcept { + return int8_t(float(*this)); + } + + constexpr explicit operator int16_t() const noexcept { + return int16_t(float(*this)); + } + + constexpr explicit operator uint16_t() const noexcept { + return uint16_t(float(*this)); + } + + constexpr explicit operator bool() const noexcept { + return bool(std::bit_cast<uint16_t>(*this)); + } + + static constexpr bfloat16 truncate_to_bfloat16(float v) noexcept { + return !std::isnan(v) ?
from_raw(static_cast<uint16_t>( + std::bit_cast<uint32_t>(v) >> 16)) + : nan(); + } + + static constexpr bfloat16 epsilon() noexcept { // 0x1.0p-7 return from_raw(0x3c00); } @@ -297,3 +368,4 @@ template <> struct is_arithmetic<nncase::bfloat16> : public true_type {}; inline nncase::bfloat16 operator"" _bf16(long double x) { return nncase::bfloat16(float(x)); } + diff --git a/ntt/include/nncase/float8.h b/ntt/include/nncase/float8.h index 3cc06ed154..d262988bfd 100644 --- a/ntt/include/nncase/float8.h +++ b/ntt/include/nncase/float8.h @@ -79,7 +79,6 @@ // #include "nncase/nncase.h" #include "bfloat16.h" #include "half.h" -#include "bfloat16.h" #ifndef CUTLASS_HOST_DEVICE #define CUTLASS_HOST_DEVICE inline #define CUTLASS_DEVICE inline @@ -493,9 +492,6 @@ struct alignas(1) float_e4m3_t : float8_base { CUTLASS_HOST_DEVICE explicit float_e4m3_t(float x) { storage = from_float(x).storage; } - CUTLASS_HOST_DEVICE - explicit float_e4m3_t(bfloat16 x) : float_e4m3_t(float(x)) {} - CUTLASS_HOST_DEVICE explicit float_e4m3_t(half x) { storage = from_half(x).storage; } @@ -508,7 +504,17 @@ struct alignas(1) float_e4m3_t : float8_base { explicit float_e4m3_t(int x) : float_e4m3_t(float(x)) {} CUTLASS_HOST_DEVICE - explicit float_e4m3_t(size_t x) : float_e4m3_t(float(x)) {} + explicit float_e4m3_t(int64_t x) : float_e4m3_t(float(x)) {} + + CUTLASS_HOST_DEVICE + explicit float_e4m3_t(bfloat16 x) : float_e4m3_t(float(x)) {} + + CUTLASS_HOST_DEVICE + explicit float_e4m3_t(uint64_t x) : float_e4m3_t(double(x)) {} + + CUTLASS_HOST_DEVICE + explicit float_e4m3_t(uint32_t x) : float_e4m3_t(float(x)) {} + /// E5M2 conversion. Defined after float_e5m2_t is defined. CUTLASS_HOST_DEVICE @@ -704,11 +710,17 @@ struct alignas(1) float_e5m2_t : float8_base { explicit float_e5m2_t(int x) : float_e5m2_t(float(x)) {} CUTLASS_HOST_DEVICE - explicit float_e5m2_t(size_t x) : float_e5m2_t(float(x)) {} + explicit float_e5m2_t(uint64_t x) : float_e5m2_t(float(x)) {} CUTLASS_HOST_DEVICE explicit float_e5m2_t(bfloat16 x) : float_e5m2_t(float(x)) {} + CUTLASS_HOST_DEVICE + explicit float_e5m2_t(int64_t x) : float_e5m2_t(float(x)) {} + + CUTLASS_HOST_DEVICE + explicit float_e5m2_t(uint32_t x) : float_e5m2_t(float(x)) {} + /// E4M3 conversion CUTLASS_HOST_DEVICE explicit float_e5m2_t(float_e4m3_t x); @@ -1025,7 +1037,8 @@ half operator*(float_e5m2_t const &lhs, float_e4m3_t const &rhs) { return half(float(lhs) * float(rhs)); } -/////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// // // float_e4m3_t <=> float_e5m2_t conversions // diff --git a/ntt/include/nncase/half.h b/ntt/include/nncase/half.h index 0be9e6be30..0a42541351 100644 --- a/ntt/include/nncase/half.h +++ b/ntt/include/nncase/half.h @@ -52,23 +52,62 @@ struct half { constexpr explicit half(const T &v) noexcept : value_(round_to_half(v).value_) {} - constexpr half(fp16_from_raw_t, uint16_t value) noexcept - : value_(std::bit_cast<_Float16>(value)) {} - constexpr operator _Float16() const noexcept { return value_; } - constexpr operator float() const noexcept { + static constexpr half round_to_half(float v) { if (std::is_constant_evaluated()) { - return (float)value_; + return (_Float16)v; } else { #ifdef __F16C__ - // To avoid extendhfdf2 - return _cvtsh_ss(raw()); + // To avoid truncsfhf2 + return from_raw(_cvtss_sh(v, _MM_FROUND_NEARBYINT)); #else - return (float)value_; + return (_Float16)v; #endif } + + return (_Float16)v; } + static constexpr half epsilon()
noexcept { return from_raw(0x0800); } + + // Integer conversion constructors + constexpr explicit half(int x) noexcept + : value_(round_to_half(float(x)).value_) {} + + constexpr explicit half(int64_t x) noexcept + : value_(round_to_half(float(x)).value_) {} + + constexpr explicit half(uint32_t x) noexcept + : value_(round_to_half(float(x)).value_) {} + + constexpr explicit half(uint64_t x) noexcept + : value_(round_to_half(double(x)).value_) {} + + // Floating point conversion constructors + constexpr explicit half(double x) noexcept + : value_(round_to_half(float(x)).value_) {} + + // bfloat16 conversion constructor + constexpr explicit half(bfloat16 x) noexcept + : value_(round_to_half(float(x)).value_) {} + + constexpr half(fp16_from_raw_t, uint16_t value) noexcept + : value_(std::bit_cast<_Float16>(value)) {} + + constexpr operator _Float16() const noexcept { return value_; } +// constexpr operator float() const noexcept { +// if (std::is_constant_evaluated()) { +// return (float)value_; +// } else { +// #ifdef __F16C__ +// // To avoid extendhfdf2 +// return _cvtsh_ss(raw()); +// #else +// return (float)value_; +// #endif +// } +// } constexpr uint16_t raw() const noexcept { return std::bit_cast<uint16_t>(value_); } @@ -77,22 +116,48 @@ struct half { return half(nncase::fp16_from_raw, v); } - static constexpr half round_to_half(float v) { - if (std::is_constant_evaluated()) { - return (_Float16)v; - } else { -#ifdef __F16C__ - // To avoid truncsfhf2 - return from_raw(_cvtss_sh(v, _MM_FROUND_NEARBYINT)); -#else - return (_Float16)v; -#endif - } + // Type conversion operators + constexpr explicit operator double() const noexcept { + return double(float(*this)); + } - return (_Float16)v; + constexpr explicit operator int8_t() const noexcept { + return int8_t(float(*this)); } - static constexpr half epsilon() noexcept { return from_raw(0x0800); } + constexpr explicit operator uint8_t() const noexcept { + return uint8_t(float(*this)); + } + + constexpr explicit operator int16_t() const noexcept { + return int16_t(float(*this)); + } + + constexpr explicit operator uint16_t() const noexcept { + return uint16_t(float(*this)); + } + + constexpr explicit operator int() const noexcept { + return int(float(*this)); + } + + constexpr explicit operator int64_t() const noexcept { + return int64_t(float(*this)); + } + + constexpr explicit operator uint32_t() const noexcept { + return uint32_t(float(*this)); + } + + constexpr explicit operator uint64_t() const noexcept { + return uint64_t(double(*this)); + } + + constexpr explicit operator bool() const noexcept { + return bool(std::bit_cast<uint16_t>(*this)); + } static constexpr half highest() noexcept { return from_raw(0x7bff); } diff --git a/ntt/include/nncase/ntt/arch/riscv64/primitive_ops.h b/ntt/include/nncase/ntt/arch/riscv64/primitive_ops.h index 9b2b4f3eaa..31a3448588 100644 --- a/ntt/include/nncase/ntt/arch/riscv64/primitive_ops.h +++ b/ntt/include/nncase/ntt/arch/riscv64/primitive_ops.h @@ -859,17 +859,20 @@ REGISTER_RVV_BINARY_OP(max, float, max_float32) inline vfloat32m##lmul##_t pow_float32(const vfloat32m##lmul##_t &v1, \ const vfloat32m##lmul##_t &v2, \ const size_t vl) { \ + COMPILER_BARRIER(); \ return pow_ps(v1, v2, vl); \ } \ \ inline vfloat32m##lmul##_t pow_float32(const vfloat32m##lmul##_t &v1, \ const float &s, const size_t vl) { \ + COMPILER_BARRIER(); \ auto v2 = __riscv_vfmv_v_f_f32m##lmul(s, vl); \ return pow_ps(v1, v2, vl); \ } \ \ inline vfloat32m##lmul##_t pow_float32( \ const float &s, const vfloat32m##lmul##_t &v2, const size_t vl) { \ +
COMPILER_BARRIER(); \ auto v1 = __riscv_vfmv_v_f_f32m##lmul(s, vl); \ return pow_ps(v1, v2, vl); \ } @@ -882,38 +885,73 @@ REGISTER_RVV_BINARY_OP(pow, float, pow_float32) inline vint32m##lmul##_t floor_mod_int32(const vint32m##lmul##_t &v1, \ const vint32m##lmul##_t &v2, \ const size_t vl) { \ + /* without a fence, the result would be incorrect on large test cases */ \ auto remainder = __riscv_vrem_vv_i32m##lmul(v1, v2, vl); \ auto tmp = __riscv_vxor_vv_i32m##lmul(v1, v2, vl); \ auto mask1 = __riscv_vmsne_vx_i32m##lmul##_b##mlen(remainder, 0, vl); \ auto mask2 = __riscv_vmslt_vx_i32m##lmul##_b##mlen(tmp, 0, vl); \ mask1 = __riscv_vmand_mm_b##mlen(mask1, mask2, vl); \ - remainder = __riscv_vadd_vv_i32m##lmul##_m(mask1, remainder, v2, vl); \ + /*remainder = __riscv_vadd_vv_i32m##lmul##_m(mask1, remainder, v2, vl);*/ \ + asm volatile ( \ + "vmv.v.v v0, %[mask]\n\t" \ + "vadd.vv %[rem], %[rem], %[val], v0.t" \ + : [rem] "+vr" (remainder) \ + : [mask] "vr" (mask1), \ + [val] "vr" (v2) \ + : "v0" \ + ); \ + /* Debug output mask values */ \ + /* + std::cout << "=== FLOOR_MOD_INT32 DEBUG ===" << std::endl; \ + print_rvv_vector_i32(v1, "v1", vl); \ + print_rvv_vector_i32(v2, "v2", vl); \ + print_rvv_vector_i32(remainder, "remainder", vl); \ + print_rvv_vector_i32(tmp, "tmp (v1^v2)", vl); \ + print_rvv_vector_i32(remainder, "final result", vl); \ + std::cout << "=== END DEBUG ===" << std::endl; \ + */ \ return remainder; \ } \ \ inline vint32m##lmul##_t floor_mod_int32( \ const vint32m##lmul##_t &v1, const int32_t &s, const size_t vl) { \ + /* without a fence, the result would be incorrect on large test cases */ \ auto remainder = __riscv_vrem_vx_i32m##lmul(v1, s, vl); \ auto tmp = __riscv_vxor_vx_i32m##lmul(v1, s, vl); \ auto mask1 = __riscv_vmsne_vx_i32m##lmul##_b##mlen(remainder, 0, vl); \ auto mask2 = __riscv_vmslt_vx_i32m##lmul##_b##mlen(tmp, 0, vl); \ mask1 = __riscv_vmand_mm_b##mlen(mask1, mask2, vl); \ - remainder = __riscv_vadd_vx_i32m##lmul##_m(mask1, remainder, s, vl); \ + asm volatile ( \ + "vmv.v.v v0, %[mask]\n\t" \ + "vadd.vx %[rem], %[rem], %[val], v0.t" \ + : [rem] "+vr" (remainder) \ + : [mask] "vr" (mask1), \ + [val] "r" (s) \ + : "v0" \ + ); \ return remainder; \ } \ \ inline vint32m##lmul##_t floor_mod_int32( \ const int32_t &s, const vint32m##lmul##_t &v2, const size_t vl) { \ + /* without a fence, the result would be incorrect on large test cases */ \ auto v1 = __riscv_vmv_v_x_i32m##lmul(s, vl); \ auto remainder = __riscv_vrem_vv_i32m##lmul(v1, v2, vl); \ auto tmp = __riscv_vxor_vv_i32m##lmul(v1, v2, vl); \ auto mask1 = __riscv_vmsne_vx_i32m##lmul##_b##mlen(remainder, 0, vl); \ auto mask2 = __riscv_vmslt_vx_i32m##lmul##_b##mlen(tmp, 0, vl); \ mask1 = __riscv_vmand_mm_b##mlen(mask1, mask2, vl); \ - remainder = __riscv_vadd_vv_i32m##lmul##_m(mask1, remainder, v2, vl); \ + asm volatile ( \ + "vmv.v.v v0, %[mask]\n\t" \ + "vadd.vv %[rem], %[rem], %[val], v0.t" \ + : [rem] "+vr" (remainder) \ + : [mask] "vr" (mask1), \ + [val] "vr" (v2) \ + : "v0" \ + ); \ return remainder; \ } - +// Workaround for a compiler or QEMU bug in the RVV int32 floor_mod kernel: the masked-add intrinsic is replaced with inline assembly.
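+// For reference, the masked vadd above implements the floor-mod sign fixup
+// sketched below in scalar form (floor_mod_ref is illustrative only, not part
+// of this kernel):
+//   int32_t floor_mod_ref(int32_t a, int32_t b) {
+//       int32_t r = a % b;            // truncated remainder (rounds toward zero)
+//       if (r != 0 && ((a ^ b) < 0))  // nonzero remainder, operands differ in sign
+//           r += b;                   // give the result the sign of b
+//       return r;                     // equals a - floor(a / b) * b
+//   }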
REGISTER_RVV_KERNEL(FLOOR_MOD_INT32) REGISTER_RVV_BINARY_OP(floor_mod, int32_t, floor_mod_int32) diff --git a/ntt/include/nncase/ntt/arch/riscv64/primitive_ops_half.h b/ntt/include/nncase/ntt/arch/riscv64/primitive_ops_half.h index 245d1f55e8..001ca48c51 100644 --- a/ntt/include/nncase/ntt/arch/riscv64/primitive_ops_half.h +++ b/ntt/include/nncase/ntt/arch/riscv64/primitive_ops_half.h @@ -17,6 +17,12 @@ namespace nncase::ntt::ops { kernel(1, 16) kernel(2, 8) kernel(4, 4) kernel(8, 2) #endif +// float32 intermediate +#ifndef REGISTER_RVV_FP16_KERNEL_FP32_IM +#define REGISTER_RVV_FP16_KERNEL_FP32_IM(kernel) \ + kernel(1, 16) kernel(2, 8) kernel(4, 4) +#endif + #define RVV_UNARY_FP16_OP(op, dtype, vl, kernel) \ template <> struct op<ntt::vector<dtype, vl>> { \ ntt::vector<dtype, vl> \ @@ -260,6 +266,7 @@ REGISTER_RVV_UNARY_FP16_OP(cosh, half, cosh_float16) auto vi = __riscv_vfcvt_x_f_v_i16m##lmul(v, vl); \ auto vf = __riscv_vfcvt_f_x_v_f16m##lmul(vi, vl); \ auto mask = __riscv_vmfgt_vv_f16m##lmul##_b##mlen(vf, v, vl); \ + __asm__ volatile("" ::: "memory"); \ vf = __riscv_vfsub_vf_f16m##lmul##_m(mask, vf, 1.f16, vl); \ return vf; \ } @@ -535,6 +542,12 @@ REGISTER_RVV_UNARY_FP16_OP(erf, half, erf_float16) RVV_BINARY_fp16_OP(op, dtype, NTT_VL(sizeof(dtype) * 8, *, 8), \ kernel) +// FP32 as intermediate result +#define REGISTER_RVV_BINARY_FP16_OPS_FP32_IM(op, dtype, kernel) \ + RVV_BINARY_fp16_OP(op, dtype, NTT_VL(sizeof(dtype) * 8, *, 1), kernel) \ + RVV_BINARY_fp16_OP(op, dtype, NTT_VL(sizeof(dtype) * 8, *, 2), kernel) \ + RVV_BINARY_fp16_OP(op, dtype, NTT_VL(sizeof(dtype) * 8, *, 4), \ + kernel) // add #define ADD_FLOAT16(lmul, mlen) \ inline vfloat16m##lmul##_t add_float16(const vfloat16m##lmul##_t &v1, \ @@ -642,39 +655,60 @@ REGISTER_RVV_BINARY_FP16_OP(div, half, div_float16) REGISTER_RVV_FP16_KERNEL(POW_FLOAT16) REGISTER_RVV_BINARY_FP16_OP(pow, half, pow_float16) +#define LMUL_DBL_1 2 +#define LMUL_DBL_2 4 +#define LMUL_DBL_4 8 + +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) + +#define DOUBLE_LMUL(lmul) CONCAT(LMUL_DBL_, lmul) +#define CALL_DBL_LMUL(name, lmul) CONCAT(name, DOUBLE_LMUL(lmul)) + // mod #define MOD_FLOAT16(lmul, mlen) \ - inline vfloat16m##lmul##_t mod_float16(const vfloat16m##lmul##_t &v1, \ + inline vfloat16m##lmul##_t mod_float16(const vfloat16m##lmul##_t &v1, \ const vfloat16m##lmul##_t &v2, \ const size_t vl) { \ - auto quotient = __riscv_vfcvt_f_x_v_f16m##lmul( \ - __riscv_vfcvt_rtz_x_f_v_i16m##lmul( \ - __riscv_vfdiv_vv_f16m##lmul(v1, v2, vl), vl), \ - vl); \ - return __riscv_vfnmsub_vv_f16m##lmul(quotient, v2, v1, vl); \ - } \ + auto v1_f32 = CALL_DBL_LMUL(__riscv_vfwcvt_f_f_v_f32m, lmul)(v1, vl); \ + auto v2_f32 = CALL_DBL_LMUL(__riscv_vfwcvt_f_f_v_f32m, lmul)(v2, vl); \ + auto division_f32 = CALL_DBL_LMUL(__riscv_vfdiv_vv_f32m, lmul)(v1_f32, v2_f32, vl); \ + auto quotient_int = CALL_DBL_LMUL(__riscv_vfcvt_rtz_x_f_v_i32m, lmul)(division_f32, vl); \ + auto quotient_f32 = CALL_DBL_LMUL(__riscv_vfcvt_f_x_v_f32m, lmul)(quotient_int, vl); \ + auto result_f32 = CALL_DBL_LMUL(__riscv_vfnmsub_vv_f32m, lmul)(quotient_f32, v2_f32, v1_f32, vl); \ + auto result_f16 = __riscv_vfncvt_f_f_w_f16m##lmul(result_f32, vl); \ + return result_f16; \ + } \ + \ \ inline vfloat16m##lmul##_t mod_float16(const vfloat16m##lmul##_t &v, \ const half &s, const size_t vl) { \ - auto quotient = __riscv_vfcvt_f_x_v_f16m##lmul( \ - __riscv_vfcvt_rtz_x_f_v_i16m##lmul( \ - __riscv_vfdiv_vf_f16m##lmul(v, s, vl), vl), \ - vl); \ - return __riscv_vfnmsub_vf_f16m##lmul(quotient, s, v, vl); \ + float s_f32 = (float)s;
\ + auto v_f32 = CALL_DBL_LMUL(__riscv_vfwcvt_f_f_v_f32m, lmul)(v, vl); \ + auto division_f32 = CALL_DBL_LMUL(__riscv_vfdiv_vf_f32m, lmul)(v_f32, s_f32, vl); \ + auto quotient_int = CALL_DBL_LMUL(__riscv_vfcvt_rtz_x_f_v_i32m, lmul)(division_f32, vl); \ + auto quotient_f32 = CALL_DBL_LMUL(__riscv_vfcvt_f_x_v_f32m, lmul)(quotient_int, vl); \ + auto result_f32 = CALL_DBL_LMUL(__riscv_vfnmsub_vf_f32m, lmul)(quotient_f32, s_f32, v_f32, vl); \ + auto result_f16 = __riscv_vfncvt_f_f_w_f16m##lmul(result_f32, vl); \ + return result_f16; \ } \ \ inline vfloat16m##lmul##_t mod_float16( \ const half &s, const vfloat16m##lmul##_t &v2, const size_t vl) { \ - auto v1 = __riscv_vfmv_v_f_f16m##lmul(s, vl); \ - auto quotient = __riscv_vfcvt_f_x_v_f16m##lmul( \ - __riscv_vfcvt_rtz_x_f_v_i16m##lmul( \ - __riscv_vfrdiv_vf_f16m##lmul(v2, s, vl), vl), \ - vl); \ - return __riscv_vfnmsub_vv_f16m##lmul(quotient, v2, v1, vl); \ - } + float s_f32 = (float)s; \ + auto v1_f32 = CALL_DBL_LMUL(__riscv_vfmv_v_f_f32m, lmul)(s_f32, vl); \ + auto v2_f32 = CALL_DBL_LMUL(__riscv_vfwcvt_f_f_v_f32m, lmul)(v2, vl); \ + auto division_f32 = CALL_DBL_LMUL(__riscv_vfrdiv_vf_f32m, lmul)(v2_f32, s_f32, vl); \ + auto quotient_int = CALL_DBL_LMUL(__riscv_vfcvt_rtz_x_f_v_i32m, lmul)(division_f32, vl); \ + auto quotient_f32 = CALL_DBL_LMUL(__riscv_vfcvt_f_x_v_f32m, lmul)(quotient_int, vl); \ + auto result_f32 = CALL_DBL_LMUL(__riscv_vfnmsub_vv_f32m, lmul)(quotient_f32, v2_f32, v1_f32, vl); \ + auto result_f16 = __riscv_vfncvt_f_f_w_f16m##lmul(result_f32, vl); \ + return result_f16; \ + } + -REGISTER_RVV_FP16_KERNEL(MOD_FLOAT16) -REGISTER_RVV_BINARY_FP16_OP(mod, half, mod_float16) +REGISTER_RVV_FP16_KERNEL_FP32_IM(MOD_FLOAT16) +REGISTER_RVV_BINARY_FP16_OPS_FP32_IM(mod, half, mod_float16) // min // template <> struct min { @@ -748,7 +782,16 @@ REGISTER_RVV_BINARY_FP16_OP(max, half, max_float16) auto mask1 = __riscv_vmsne_vx_i16m##lmul##_b##mlen(remainder, 0, vl); \ auto mask2 = __riscv_vmslt_vx_i16m##lmul##_b##mlen(tmp, 0, vl); \ mask1 = __riscv_vmand_mm_b##mlen(mask1, mask2, vl); \ - remainder = __riscv_vadd_vv_i16m##lmul##_m(mask1, remainder, v2, vl); \ + __asm__ volatile("" ::: "memory"); \ +/* remainder = __riscv_vadd_vv_i16m##lmul##_m(mask1, remainder, v2, vl); \ */ \ + asm volatile ( \ + "vmv.v.v v0, %[mask]\n\t" \ + "vadd.vv %[rem], %[rem], %[val], v0.t" \ + : [rem] "+vr" (remainder) \ + : [mask] "vr" (mask1), \ + [val] "vr" (v2) \ + : "v0" \ + ); \ return remainder; \ } \ \ @@ -759,7 +802,15 @@ REGISTER_RVV_BINARY_FP16_OP(max, half, max_float16) auto mask1 = __riscv_vmsne_vx_i16m##lmul##_b##mlen(remainder, 0, vl); \ auto mask2 = __riscv_vmslt_vx_i16m##lmul##_b##mlen(tmp, 0, vl); \ mask1 = __riscv_vmand_mm_b##mlen(mask1, mask2, vl); \ -/* remainder = __riscv_vadd_vx_i16m##lmul##_m(mask1, remainder, s, vl); \ */ \ + asm volatile ( \ + "vmv.v.v v0, %[mask]\n\t" \ + "vadd.vx %[rem], %[rem], %[val], v0.t" \ + : [rem] "+vr" (remainder) \ + : [mask] "vr" (mask1), \ + [val] "r" (s) \ + : "v0" \ + ); \ return remainder; \ } \ \ @@ -771,7 +822,15 @@ REGISTER_RVV_BINARY_FP16_OP(max, half, max_float16) auto mask1 = __riscv_vmsne_vx_i16m##lmul##_b##mlen(remainder, 0, vl); \ auto mask2 = __riscv_vmslt_vx_i16m##lmul##_b##mlen(tmp, 0, vl); \ mask1 = __riscv_vmand_mm_b##mlen(mask1, mask2, vl); \ -/* remainder = __riscv_vadd_vv_i16m##lmul##_m(mask1, remainder, v2, vl); \ */ \ + asm volatile ( \ + "vmv.v.v
v0, %[mask]\n\t" \ + "vadd.vv %[rem], %[rem], %[val], v0.t" \ + : [rem] "+vr" (remainder) \ + : [mask] "vr" (mask1), \ + [val] "vr" (v2) \ + : "v0" \ + ); \ return remainder; \ } diff --git a/ntt/include/nncase/ntt/arch/riscv64/rvv_mathfun.h b/ntt/include/nncase/ntt/arch/riscv64/rvv_mathfun.h index 72a3f64086..33d7ac360f 100644 --- a/ntt/include/nncase/ntt/arch/riscv64/rvv_mathfun.h +++ b/ntt/include/nncase/ntt/arch/riscv64/rvv_mathfun.h @@ -15,10 +15,98 @@ #pragma once #include +#define COMPILER_BARRIER() __asm__ volatile("" ::: "memory") + #if __riscv_vector #include <riscv_vector.h> +#ifdef DE_BUG +#include <iostream> +#include <iomanip> +#include <limits> + +#define __RVV_PRINT_VECTOR_INT(LMUL, MLEN, TLEN) \ + void print_rvv_vector_i##TLEN(const vint##TLEN##m##LMUL##_t &vec, const char *label, const size_t print_vl){ \ + int##TLEN##_t temp[(LMUL*NTT_VLEN)/TLEN]; \ + __riscv_vse##TLEN##_v_i##TLEN##m##LMUL(temp, vec, print_vl); \ + std::cout << label << ": "; \ + for (size_t i = 0; i < print_vl; ++i) { \ + std::cout << temp[i] << " "; \ + } \ + std::cout << std::endl; \ + } + +__RVV_PRINT_VECTOR_INT(1, 32, 32) +__RVV_PRINT_VECTOR_INT(2, 16, 32) +__RVV_PRINT_VECTOR_INT(4, 8, 32) +__RVV_PRINT_VECTOR_INT(8, 4, 32) + +#define __RVV_PRINT_VECTOR_FLOAT(LMUL, MLEN, TLEN) \ + void print_rvv_vector_f##TLEN(const vfloat##TLEN##m##LMUL##_t &vec, const char *label, const size_t print_vl){ \ + float temp[(LMUL*NTT_VLEN/TLEN)]; \ + __riscv_vse##TLEN##_v_f##TLEN##m##LMUL(temp, vec, print_vl); \ + std::cout << label << ": "; \ + for (size_t i = 0; i < print_vl; ++i) { \ + std::cout << temp[i] << " "; \ + } \ + std::cout << std::endl; \ + } + +__RVV_PRINT_VECTOR_FLOAT(1, 32, 32) +__RVV_PRINT_VECTOR_FLOAT(2, 16, 32) +__RVV_PRINT_VECTOR_FLOAT(4, 8, 32) +__RVV_PRINT_VECTOR_FLOAT(8, 4, 32) + + +#define __RVV_PRINT_VECTOR_HALF(LMUL, MLEN, TLEN) \ + void print_rvv_vector_f##TLEN(const vfloat##TLEN##m##LMUL##_t &vec, const char *label, const size_t print_vl){ \ + _Float16 temp[(LMUL*NTT_VLEN/TLEN)]; \ + __riscv_vse##TLEN##_v_f##TLEN##m##LMUL(temp, vec, print_vl); \ + std::cout << label << ": "; \ + for (size_t i = 0; i < print_vl; ++i) { \ + std::cout << std::setprecision(std::numeric_limits<float>::max_digits10) << temp[i] << " "; \ + } \ + std::cout << std::endl; \ + } + + +__RVV_PRINT_VECTOR_HALF(1, 16, 16) +__RVV_PRINT_VECTOR_HALF(2, 8, 16) +__RVV_PRINT_VECTOR_HALF(4, 4, 16) +__RVV_PRINT_VECTOR_HALF(8, 2, 16) + + +// template <size_t vl> +// void print_rvv_vector_i32(const vint32m1_t &vec, const char *label, const size_t print_vl) { +// int32_t temp[vl]; +// __riscv_vse32_v_i32m1(temp, vec, print_vl); +// std::cout << label << ": "; +// for (size_t i = 0; i < print_vl; ++i) { +// std::cout << temp[i] << " "; +// } +// std::cout << std::endl; +// } + + +#define __RVV_PRINT_MASK(BTYPE, MLEN) \ + void print_rvv_mask_##MLEN(const vbool##MLEN##_t &mask, const char *label, const size_t print_vl) { \ + uint8_t temp[MLEN]; \ + __riscv_vsm_v_b##MLEN(temp, mask, print_vl); \ + std::cout << label << ": "; \ + for (size_t i = 0; i < print_vl; ++i) { \ + std::cout << static_cast<int>(temp[i]) << " "; \ + } \ + std::cout << std::endl; \ + } + +__RVV_PRINT_MASK(32, 32) +__RVV_PRINT_MASK(16, 16) +__RVV_PRINT_MASK(8, 8) +__RVV_PRINT_MASK(4, 4) + +#endif + #define c_inv_mant_mask ~0x7f800000u #define c_cephes_SQRTHF 0.707106781186547524 #define c_cephes_log_p0 7.0376836292E-2 @@ -95,8 +183,8 @@ _RVV_FLOAT32_LOG_OP(2, 16) _RVV_FLOAT32_LOG_OP(4, 8) _RVV_FLOAT32_LOG_OP(8, 4) -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f +#define c_exp_hi 88.0f +#define c_exp_lo -88.0f
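+// Note on the clamp bounds: expf overflows float32 once x > ln(FLT_MAX) ~= 88.7228;
+// the classic cephes bound 88.3762626647949 equals ln(2^127.5) and sits just inside
+// that limit. Lowering the clamp to +/-88.0 leaves extra headroom, since
+// expf(88.0f) ~= 1.65e38 < FLT_MAX ~= 3.40e38.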
#define c_cephes_LOG2EF 1.44269504088896341 #define c_cephes_exp_C1 0.693359375 @@ -382,12 +470,107 @@ _RVV_FLOAT_TANH_OP(2, 16, 32) _RVV_FLOAT_TANH_OP(4, 8, 32) _RVV_FLOAT_TANH_OP(8, 4, 32) -#define _RVV_FLOAT_POW_OP(LMUL, MLEN, TLEN) \ - static inline vfloat##TLEN##m##LMUL##_t pow_ps( \ +#define _RVV_FLOAT_FLOOR_OP(LMUL, MLEN, TLEN) \ + static inline vfloat##TLEN##m##LMUL##_t vfloor_v_f##TLEN##m##LMUL( \ + vfloat##TLEN##m##LMUL##_t val, size_t vl) { \ + /* 1. Convert float to int (round towards zero) */ \ + vint##TLEN##m##LMUL##_t i_val = \ + __riscv_vfcvt_rtz_x_f_v_i##TLEN##m##LMUL(val, vl); \ + /* 2. Convert int back to float */ \ + return __riscv_vfcvt_f_x_v_f##TLEN##m##LMUL(i_val, vl); \ + } +_RVV_FLOAT_FLOOR_OP(1, 32, 32) +_RVV_FLOAT_FLOOR_OP(2, 16, 32) + +const float fp32_inf = std::numeric_limits<float>::infinity(); +// To reuse this block for other float widths, the following should be done: +// 1. replace {i/f}32 with {i/f}TLEN +// 2. use another macro to obtain the "twoPow24" threshold for the given float length +#define __RVV_FLOAT32_IS_INTEGER(LMUL, MLEN) \ + static inline vbool##MLEN##_t __vfloat32_is_integer_##LMUL( \ + vfloat32m##LMUL##_t v, size_t vl) { \ + const float twoPow24 = 16777216.0f; \ + /* a float this large must have an integer value */ \ + auto v_abs = __riscv_vfabs_v_f32m##LMUL(v, vl); \ + auto huge_float_flag = __riscv_vmfgt_vf_f32m##LMUL##_b##MLEN(v_abs, twoPow24, vl); \ + auto v_is_not_inf_flag = __riscv_vmfne_vf_f32m##LMUL##_b##MLEN(v, fp32_inf, vl); \ + huge_float_flag = __riscv_vmand_mm_b##MLEN(huge_float_flag, v_is_not_inf_flag, vl); \ + auto v_to_int = __riscv_vfcvt_rtz_x_f_v_i32m##LMUL(v, vl); \ + auto back_to_float = __riscv_vfcvt_f_x_v_f32m##LMUL(v_to_int, vl); \ + auto is_int_flag = __riscv_vmfeq_vv_f32m##LMUL##_b##MLEN(v, back_to_float, vl); \ + return __riscv_vmor_mm_b##MLEN(huge_float_flag, is_int_flag, vl); \ + } + +__RVV_FLOAT32_IS_INTEGER(1, 32) +__RVV_FLOAT32_IS_INTEGER(2, 16) +__RVV_FLOAT32_IS_INTEGER(4, 8) +__RVV_FLOAT32_IS_INTEGER(8, 4) + +#define __RVV_FLOAT32_IS_EVEN(LMUL, MLEN) \ + static inline vbool##MLEN##_t __vfloat32_is_even_##LMUL( \ + vfloat32m##LMUL##_t v, size_t vl) { \ + const float twoPow24 = 16777216.0f; \ + auto v_abs = __riscv_vfabs_v_f32m##LMUL(v, vl); \ + auto huge_float_flag = __riscv_vmfgt_vf_f32m##LMUL##_b##MLEN(v_abs, twoPow24, vl); \ + auto v_is_not_inf_flag = __riscv_vmfne_vf_f32m##LMUL##_b##MLEN(v, fp32_inf, vl); \ + huge_float_flag = __riscv_vmand_mm_b##MLEN(huge_float_flag, v_is_not_inf_flag, vl); \ + /* test if v == ((int)v / 2 * 2) */ \ + auto v_to_int = __riscv_vfcvt_rtz_x_f_v_i32m##LMUL(v, vl); \ + auto v_to_int_div2 = __riscv_vsra_vx_i32m##LMUL(v_to_int, 1, vl); \ + auto v_div_mul_2 = __riscv_vsll_vx_i32m##LMUL(v_to_int_div2, 1, vl); \ + auto is_even_flag = __riscv_vmseq_vv_i32m##LMUL##_b##MLEN(v_to_int, v_div_mul_2, vl); \ + return __riscv_vmor_mm_b##MLEN(huge_float_flag, is_even_flag, vl); \ + } + +__RVV_FLOAT32_IS_EVEN(1, 32) +__RVV_FLOAT32_IS_EVEN(2, 16) +__RVV_FLOAT32_IS_EVEN(4, 8) +__RVV_FLOAT32_IS_EVEN(8, 4) + +#define _RVV_FLOAT_POW_OP(LMUL, MLEN, TLEN) \ + static inline vfloat##TLEN##m##LMUL##_t pow_ps( \ vfloat##TLEN##m##LMUL##_t a, vfloat##TLEN##m##LMUL##_t b, size_t vl) { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(__riscv_vfmul_vv_f##TLEN##m##LMUL(b, log_ps(a, vl), vl), \ - vl); \ + /* --- constants --- */ \ + float scalar_nan = nanf(""); \ + auto nan_vector = __riscv_vfmv_v_f_f##TLEN##m##LMUL(scalar_nan, vl); \ + COMPILER_BARRIER(); \ + /* --- input a --- */ \ + auto neg_a_mask = __riscv_vmflt_vf_f##TLEN##m##LMUL##_b##MLEN(a,
0.f, vl); \ + auto abs_a = __riscv_vfabs_v_f##TLEN##m##LMUL(a, vl); \ + \ + /* --- |a|^b --- */ \ + auto result = exp_ps(__riscv_vfmul_vv_f##TLEN##m##LMUL(b, log_ps(abs_a, vl), vl), vl); \ + COMPILER_BARRIER(); \ + \ + /* --- handle a < 0 --- */ \ + if(__riscv_vcpop_m_b##MLEN(neg_a_mask, vl) != 0) { \ + auto b_int_mask = __vfloat32_is_integer_##LMUL(b, vl); \ + \ + auto b_even_mask = __vfloat32_is_even_##LMUL(b, vl); \ + auto b_not_even_mask = __riscv_vmnot_m_b##MLEN(b_even_mask, vl); \ + \ + /* set to neg, a < 0 AND b is int AND b is not even*/ \ + auto flip_sign_mask = __riscv_vmand_mm_b##MLEN(neg_a_mask, b_int_mask, vl); \ + \ + flip_sign_mask = __riscv_vmand_mm_b##MLEN(flip_sign_mask, b_not_even_mask, vl); \ + \ + COMPILER_BARRIER(); \ + /* set to NaN, a < 0 AND b is not an integer */ \ + auto is_not_int_mask = __riscv_vmnot_m_b##MLEN(b_int_mask, vl); \ + auto set_nan_mask = __riscv_vmand_mm_b##MLEN(neg_a_mask, is_not_int_mask, vl); \ + \ + COMPILER_BARRIER(); \ + /* --- use the masks to adjust the result --- */ \ + /* a. set to neg */ \ + auto neg_result = __riscv_vfneg_v_f##TLEN##m##LMUL##_m(flip_sign_mask, result, vl); \ + \ + auto signed_result = __riscv_vmerge_vvm_f##TLEN##m##LMUL(result, neg_result, flip_sign_mask, vl); \ + /* b. set to NaN */ \ + result = __riscv_vmerge_vvm_f##TLEN##m##LMUL(signed_result, nan_vector, set_nan_mask, vl); \ + \ + } \ + \ + return result; \ } _RVV_FLOAT_POW_OP(1, 32, 32) @@ -722,4 +905,4 @@ _RVV_FLOAT_ERF_OP(1, 32, 32) _RVV_FLOAT_ERF_OP(2, 16, 32) _RVV_FLOAT_ERF_OP(4, 8, 32) _RVV_FLOAT_ERF_OP(8, 4, 32) -#endif \ No newline at end of file +#endif diff --git a/ntt/include/nncase/ntt/arch/x86_64/avx_mathfun.h b/ntt/include/nncase/ntt/arch/x86_64/avx_mathfun.h index 0a23248cb6..c8fb4bd0d1 100644 --- a/ntt/include/nncase/ntt/arch/x86_64/avx_mathfun.h +++ b/ntt/include/nncase/ntt/arch/x86_64/avx_mathfun.h @@ -68,6 +68,10 @@ _PI32AVX_CONST(4, 4); _PS256_CONST(1, 1.0f); _PS256_CONST(0p5, 0.5f); +_PS256_CONST(2, 2.0f); +_PS256_CONST(nan, NAN); + + /* the smallest non denormalized float number */ _PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); _PS256_CONST_TYPE(mant_mask, int, 0x7f800000); @@ -75,6 +79,7 @@ _PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); _PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); _PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); +_PS256_CONST_TYPE(all_bits, int, -1); _PI32_CONST256(0, 0); _PI32_CONST256(1, 1); @@ -748,9 +753,54 @@ static inline __m256 tan256_ps(__m256 x) { return ytan; } + +// static inline __m256 pow256_ps(__m256 a, __m256 b) { +// // pow(x, m) = exp(m * log(x)) +// return exp256_ps(_mm256_mul_ps(b, log256_ps(a))); +// } static inline __m256 pow256_ps(__m256 a, __m256 b) { - // pow(x, m) = exp(m * log(x)) - return exp256_ps(_mm256_mul_ps(b, log256_ps(a))); + // --- constants --- + const __m256 zero = _mm256_setzero_ps(); + const __m256 two = *(__m256*)_ps256_2; + const __m256 half = *(__m256*)_ps256_0p5; + const __m256 nan_val = *(__m256*)_ps256_nan; + const __m256 abs_mask = *(__m256*)_ps256_inv_sign_mask; + const __m256 sign_mask= *(__m256*)_ps256_sign_mask; + const __m256 all_bits = *(__m256*)_ps256_all_bits; + + // --- input a --- + __m256 neg_a_mask = _mm256_cmp_ps(a, zero, _CMP_LT_OS); + __m256 abs_a = _mm256_and_ps(a, abs_mask); + + // --- |a|^b --- + __m256 result = exp256_ps(_mm256_mul_ps(b, log256_ps(abs_a))); + + // --- handle a < 0 --- + if (_mm256_movemask_ps(neg_a_mask) != 0) { + __m256 b_floor = _mm256_floor_ps(b); + __m256 is_int_mask = _mm256_cmp_ps(b, b_floor, _CMP_EQ_OQ); + + __m256 
b_div_2_floor = _mm256_floor_ps(_mm256_mul_ps(b, half)); + __m256 is_odd_mask = _mm256_cmp_ps(_mm256_mul_ps(b_div_2_floor, two), b_floor, _CMP_NEQ_UQ); + + // set to neg, a < 0 AND b is odd + __m256 flip_sign_mask = _mm256_and_ps(neg_a_mask, is_int_mask); + flip_sign_mask = _mm256_and_ps(flip_sign_mask, is_odd_mask); + + // set to NaN, a < 0 AND b is not an integer + __m256 is_not_int_mask = _mm256_xor_ps(is_int_mask, all_bits); + __m256 set_nan_mask = _mm256_and_ps(neg_a_mask, is_not_int_mask); + + // --- use the masks to adjust the result --- + // a. set to neg + __m256 sign_flipper = _mm256_and_ps(flip_sign_mask, sign_mask); + result = _mm256_xor_ps(result, sign_flipper); + + // b. set to NaN + result = _mm256_blendv_ps(result, nan_val, set_nan_mask); + } + + return result; } static inline __m256 asin256_ps(__m256 x) { diff --git a/ntt/include/nncase/ntt/kernels/cast.h b/ntt/include/nncase/ntt/kernels/cast.h index cb56762154..fbc7929c77 100644 --- a/ntt/include/nncase/ntt/kernels/cast.h +++ b/ntt/include/nncase/ntt/kernels/cast.h @@ -18,6 +18,8 @@ #include "../post_ops.h" #include "../tensor_ops.h" #include "../ukernels.h" +#include <cassert> +#include <type_traits> #include "../utility.h" #include "nncase/ntt/shape.h" @@ -27,20 +29,24 @@ template class TPostOp> class cast_impl { inline static constexpr size_t rank = TIn::rank(); - - // FIXME: vector of x86 may fail. + // !! For vectors, the element count must be the same as that of the other cast operand. using InElemType = element_or_scalar_t<TIn>; using OutElemType = element_or_scalar_t<TOut>; static_assert((Vector<InElemType> && Vector<OutElemType>) || (Scalar<InElemType> && Scalar<OutElemType>), "input & output must have the same type."); inline static constexpr auto in_ele_size = - sizeof(std::conditional_t<Vector<InElemType>, + sizeof(std::conditional_t<Vector<InElemType>, // if vector + element_or_scalar_t<InElemType>, size_t>); inline static constexpr auto out_ele_size = sizeof(std::conditional_t<Vector<OutElemType>, element_or_scalar_t<OutElemType>, size_t>); - inline static constexpr float scale = (float)in_ele_size / out_ele_size; + + inline static constexpr bool is_bool_vector = + Vector<InElemType> && (std::is_same_v<element_or_scalar_t<InElemType>, bool> || + std::is_same_v<element_or_scalar_t<OutElemType>, bool>); + + inline static constexpr float scale = is_bool_vector ? 1.0f : (float)in_ele_size / out_ele_size; inline static constexpr auto in_offset_scale = scale > 1.0f ?
(size_t)scale : (size_t)1; @@ -69,11 +75,18 @@ class cast_impl { #endif constexpr VectorizedAxes vectorizedAxes; if constexpr (scale >= 1.f) { + if constexpr (VectorizedAxes::rank() == 1) { + assert( + (dim_value(input.shape()[fixed_dim_v]) == + dim_value(output.shape()[fixed_dim_v]) * scale) + ); + } ntt::apply(output.shape(), [&](auto index) { auto in_index = index; - if constexpr (vectorizedAxes.rank() == 1) + if constexpr (VectorizedAxes::rank() == 1) in_index[fixed_dim_v] *= in_offset_scale; + __asm__ volatile("" ::: "memory"); ntt::u_cast( &input(in_index), vectorizedAxes.rank() == 1 @@ -82,11 +95,18 @@ class cast_impl { &output(index), 1, 1); }); } else { + if constexpr (VectorizedAxes::rank() == 1) { + assert( + (float)dim_value(input.shape()[fixed_dim_v]) == + (float)dim_value(output.shape()[fixed_dim_v]) * scale + ); + } ntt::apply(input.shape(), [&](auto index) { auto out_index = index; if constexpr (vectorizedAxes.rank() == 1) out_index[fixed_dim_v] *= out_offset_scale; + __asm__ volatile("" ::: "memory"); ntt::u_cast( &input(index), 1, &output(out_index), vectorizedAxes.rank() == 1 @@ -101,6 +121,7 @@ class cast_impl { #if 0 template <class TContiguousDims, class TRestDims> constexpr void + // rest_dims are the dimensions of the tensor to be cast apply(const TContiguousDims &conti_dims, const TRestDims &rest_dims, dynamic_shape_t &index, const TIn &input, TOut &output) { if (conti_dims == rest_dims.rank()) { diff --git a/ntt/include/nncase/ntt/primitive_ops.h b/ntt/include/nncase/ntt/primitive_ops.h index c04ac77bcc..2884006e2d 100644 --- a/ntt/include/nncase/ntt/primitive_ops.h +++ b/ntt/include/nncase/ntt/primitive_ops.h @@ -180,12 +180,13 @@ template <class T1, class T2> struct mul { template <class T1, class T2> struct div { constexpr auto operator()(const T1 &v1, const T2 &v2) const noexcept { - static_assert(std::is_same_v<T1, T2>, "T1 and T2 must be same type"); return v1 / v2; } }; template <class T1, class T2> struct ceil_div { + static_assert(std::is_integral_v<T1> && std::is_integral_v<T2>, + "T1 and T2 must be integral types"); constexpr auto operator()(const T1 &v1, const T2 &v2) const noexcept { return (v1 + (v2 - 1)) / v2; } @@ -197,9 +198,20 @@ template <class T1, class T2> struct ceil_div { */ template <class T1, class T2> struct floor_mod { constexpr auto operator()(const T1 &v1, const T2 &v2) const noexcept { - return (T1)((double)v1 - std::floor(static_cast<double>(v1) / - static_cast<double>(v2)) * - (double)v2); + return (T1)(double(v1) - + std::floor(static_cast<double>(v1) / static_cast<double>(v2)) * + static_cast<double>(v2)); + } +}; + + +template <class T> +requires (std::is_same_v<T, half> || std::is_same_v<T, bfloat16>) +struct floor_mod<T, T> { + constexpr auto operator()(T v1, + T v2) const noexcept { + + return T(v1 - (std::floor(float(v1) / float(v2)) * v2)); } }; @@ -220,10 +232,23 @@ template <class T1, class T2> struct outer_product { */ template <class T1, class T2> struct mod { constexpr auto operator()(const T1 &v1, const T2 &v2) const noexcept { - return std::fmod((float)v1, (float)v2); + return (T1)std::fmod((double)v1, (double)v2); } }; + +template <class T> +requires (std::is_same_v<T, half> || std::is_same_v<T, bfloat16>) +struct mod<T, T> { + constexpr auto operator()(T v1, + T v2) const noexcept { + return T( + std::fmod(static_cast<float>(v1), static_cast<float>(v2))); + } +}; + + + template <class T1, class T2> struct min { constexpr auto operator()(const T1 &v1, const T2 &v2) const noexcept { return std::min(v1, v2); @@ -310,7 +335,9 @@ template <class T1, class T2> struct clamp { template <class T1, class T2> struct cast { constexpr T2 operator()(const T1 &v) const noexcept { + // printf("cast from %f to %f\n", (double)(float)v, (double)static_cast<T2>(v)); return static_cast<T2>(v); + } }; @@ -551,7 +578,8 @@ template <class T> constexpr T swish<T>::operator()(const T &v) const noexcept { // swishb(v, beta) = v / (exp(-v*beta) + 1) template <class T, class B>
constexpr T swishb<T, B>::operator()(const T &v, const B &beta) const noexcept { - return v / (ntt::exp(-v * beta) + 1); + // -(double)v is needed for unsigned input types. + return static_cast<T>(double(v) / (ntt::exp((-(double)v) * (double)beta) + (double)1)); } template diff --git a/ntt/include/nncase/ntt/ukernels/u_cast.h b/ntt/include/nncase/ntt/ukernels/u_cast.h index 9c9bdbeeaa..c835e8f527 100644 --- a/ntt/include/nncase/ntt/ukernels/u_cast.h +++ b/ntt/include/nncase/ntt/ukernels/u_cast.h @@ -32,8 +32,32 @@ struct u_cast { size_t output_stride, size_t count) noexcept { using policy_t = u_cast_policy; constexpr auto unroll = policy_t::unroll; + + if constexpr (in_offset_scale == 8 && out_offset_scale == 1) { + while (count / unroll) { + for (size_t i = 0; i < unroll; i++) { + *output = + ntt::ops::cast<T1, T2>()(*(input + 0 * input_stride), *(input + 1 * input_stride), + *(input + 2 * input_stride), *(input + 3 * input_stride), + *(input + 4 * input_stride), *(input + 5 * input_stride), + *(input + 6 * input_stride), *(input + 7 * input_stride)); + input += input_stride * in_offset_scale; + output += output_stride * out_offset_scale; + count--; + } + } - if constexpr (in_offset_scale == 4 && out_offset_scale == 1) { + for (size_t i = 0; i < count; i++) { + *output = ntt::ops::cast<T1, T2>()( + *(input + 0 * input_stride), *(input + 1 * input_stride), + *(input + 2 * input_stride), *(input + 3 * input_stride), + *(input + 4 * input_stride), *(input + 5 * input_stride), + *(input + 6 * input_stride), *(input + 7 * input_stride)); + input += input_stride * in_offset_scale; + output += output_stride * out_offset_scale; + } + } + else if constexpr (in_offset_scale == 4 && out_offset_scale == 1) { while (count / unroll) { for (size_t i = 0; i < unroll; i++) { *output = @@ -78,7 +102,6 @@ struct u_cast { } } else if constexpr (in_offset_scale == 1 && out_offset_scale > 1) { using value_type = typename T2::element_type; - constexpr auto lanes = T2::shape(); while (count / unroll) { for (size_t i = 0; i < unroll; i++) { @@ -104,6 +127,7 @@ } } else { + while (count / unroll) { for (size_t i = 0; i < unroll; i++) { *output = ntt::ops::cast<T1, T2>()(*input); diff --git a/ntt/include/nncase/ntt/vector.h b/ntt/include/nncase/ntt/vector.h index 2c01fcbf9e..4388eba6a7 100644 --- a/ntt/include/nncase/ntt/vector.h +++ b/ntt/include/nncase/ntt/vector.h @@ -145,4 +145,32 @@ template <class T> struct vector_rank { }; template <class T> constexpr inline auto vector_rank_v = vector_rank<T>::value; + +template <class Shape> +struct last_lane; + +template +struct last_lane> { + static constexpr size_t value = D::value; +}; + +template +struct last_lane> { + static constexpr size_t value = last_lane>::value; +}; + +template <class TVec> +struct get_last_lane_vector { + using element_type = typename TVec::element_type; + using shape_type = typename TVec::shape_type; + + static constexpr size_t last_dim = last_lane<shape_type>::value; + + using type = nncase::ntt::replace_lanes_t; +}; + +template <class TVec> +using get_last_lane_vector_t = typename get_last_lane_vector<TVec>::type; + + } // namespace nncase::ntt diff --git a/ntt/include/nncase/ntt/vector_ops.h b/ntt/include/nncase/ntt/vector_ops.h index de2d35c02d..5a47839eb5 100644 --- a/ntt/include/nncase/ntt/vector_ops.h +++ b/ntt/include/nncase/ntt/vector_ops.h @@ -64,12 +64,17 @@ struct tensor_unary_impl { Op op_; }; -template