diff --git a/src/core/framework/index_meta.cc b/src/core/framework/index_meta.cc index 11d54cb63..d0eadb02d 100644 --- a/src/core/framework/index_meta.cc +++ b/src/core/framework/index_meta.cc @@ -30,7 +30,8 @@ struct IndexMetaFormatHeader { uint32_t space_id; uint32_t attachment_offset; uint32_t attachment_size; - uint8_t reserved_[4092]; + uint32_t extra_meta_size; + uint8_t reserved_[4088]; }; static_assert(sizeof(IndexMetaFormatHeader) % 32 == 0, @@ -47,6 +48,7 @@ void IndexMeta::serialize(std::string *out) const { format.dimension = dimension_; format.unit_size = unit_size_; format.space_id = space_id_; + format.extra_meta_size = extra_meta_size_; if (!metric_name_.empty()) { ailego::Params item; diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index e4db83146..bbb2e587d 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -96,24 +96,44 @@ class QuantizedIntegerMetric : public IndexMetric { switch (origin_metric_type_) { case MetricType::kSquaredEuclidean: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kSquaredEuclidean, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret && m == 1 && n == 1) { return turbo_ret; } return DistanceMatrixCompute(m, n); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kSquaredEuclidean, + turbo::DataType::kInt4, quantize_type_); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } + return DistanceMatrixCompute(m, n); } break; case MetricType::kInnerProduct: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kInnerProduct, + turbo::DataType::kInt8, 
quantize_type_); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } + if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kInnerProduct, + turbo::DataType::kInt4, quantize_type_); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } break; @@ -137,9 +157,9 @@ class QuantizedIntegerMetric : public IndexMetric { break; case MetricType::kCosine: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kCosine, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kCosine, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret) { return turbo_ret; } @@ -160,7 +180,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { auto turbo_ret = turbo::get_batch_distance_func( turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + quantize_type_); if (turbo_ret) { return turbo_ret; } @@ -215,7 +235,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { auto turbo_ret = turbo::get_batch_distance_func( turbo::MetricType::kCosine, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + quantize_type_); if (turbo_ret) { return turbo_ret; } diff --git a/src/include/zvec/core/framework/index_meta.h b/src/include/zvec/core/framework/index_meta.h index 3a09aaefb..a11af00f4 100644 --- a/src/include/zvec/core/framework/index_meta.h +++ b/src/include/zvec/core/framework/index_meta.h @@ -40,6 +40,7 @@ class IndexMeta { DT_BINARY64 = 8, }; + /*! 
Major Orders */ enum MajorOrder { @@ -586,6 +587,7 @@ class IndexMeta { uint32_t dimension_{0}; uint32_t unit_size_{0}; uint32_t element_size_{0}; + uint32_t extra_meta_size_{0}; uint64_t space_id_{0}; uint32_t metric_revision_{0}; uint32_t converter_revision_{0}; @@ -708,6 +710,8 @@ class IndexQueryMeta { uint32_t dimension_{0}; uint32_t unit_size_{0}; uint32_t element_size_{0}; + uint32_t extra_meta_size_{0}; + uint32_t quantize_type_{0}; }; } // namespace core diff --git a/src/include/zvec/core/framework/index_metric.h b/src/include/zvec/core/framework/index_metric.h index 24d772362..eeb54099f 100644 --- a/src/include/zvec/core/framework/index_metric.h +++ b/src/include/zvec/core/framework/index_metric.h @@ -137,6 +137,9 @@ struct IndexMetric : public IndexModule { virtual DistanceBatchQueryPreprocessFunc get_query_preprocess_func() const { return nullptr; } + + private: + int quantize_type_{0}; }; } // namespace core diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h index 6ecbfdd1e..f07ace8c6 100644 --- a/src/include/zvec/turbo/turbo.h +++ b/src/include/zvec/turbo/turbo.h @@ -28,28 +28,51 @@ using QueryPreprocessFunc = enum class MetricType { kSquaredEuclidean, kCosine, + kInnerProduct, kMipsSquaredEuclidean, kUnknown, }; enum class DataType { + kInt4, kInt8, + kFp16, + kFp32, kUnknown, }; enum class QuantizeType { kDefault, + kRecordInt8, + kRecordInt4, + kInt8, + kInt4, + kFp16, + kPQ, + kRabit +}; + +enum class CpuArchType { + kAuto, + kScalar, + kSSE, + kAVX, + kAVX2, + kAVX512, + kAVX512VNNI, + kAVX512FP16 }; DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type); + QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); -BatchDistanceFunc get_batch_distance_func(MetricType metric_type, - DataType data_type, - QuantizeType quantize_type); +BatchDistanceFunc get_batch_distance_func( + MetricType metric_type, DataType data_type, QuantizeType quantize_type, + 
CpuArchType cpu_arch_type = CpuArchType::kAuto); -QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, - DataType data_type, - QuantizeType quantize_type); +QueryPreprocessFunc get_query_preprocess_func( + MetricType metric_type, DataType data_type, QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); } // namespace zvec::turbo diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 9cbb2fac7..e51f72b1a 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -13,18 +13,71 @@ endif() file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) -# Set per-file compile flags for AVX512-VNNI sources. -# set_source_files_properties is directory-scoped, so it must be called in the -# same directory that adds the sources to a target (i.e. here, not in a -# subdirectory). if(NOT ANDROID AND AUTO_DETECT_ARCH) if (HOST_ARCH MATCHES "^(x86|x64)$") - file(GLOB_RECURSE AVX512_VNNI_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc) + file(GLOB_RECURSE AVX512_AVX512FP16_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.c) + set_source_files_properties( + ${AVX512_AVX512FP16_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512FP16}" + ) + + # Set per-file compile flags for AVX512-VNNI sources. + # set_source_files_properties is directory-scoped, so it must be called in the + # same directory that adds the sources to a target (i.e. here, not in a + # subdirectory). 
+ file(GLOB_RECURSE AVX512_VNNI_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.c) set_source_files_properties( ${AVX512_VNNI_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) + + file(GLOB_RECURSE AVX512_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.c) + set_source_files_properties( + ${AVX512_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" + ) + + file(GLOB_RECURSE AVX2_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.c) + set_source_files_properties( + ${AVX2_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX2}" + ) + + file(GLOB_RECURSE SSE_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.c) + set_source_files_properties( + ${SSE_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_SSE}" + ) + elseif (HOST_ARCH MATCHES "^(arm|arm64)$") + set(TURBO_MARCH_FLAG_NEON "-march=armv8-a") + + file(GLOB_RECURSE NEON_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/armv8/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/armv8/*.c + ) + + set_source_files_properties( + ${NEON_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_NEON}" + ) endif() endif() diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc new file mode 100644 index 000000000..7e2b990d7 --- /dev/null +++ b/src/turbo/armv8/float32/cosine.cc @@ -0,0 +1,62 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/inner_product_common.h" + +namespace zvec::turbo::armv8 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + size_t extra_size, float *distance) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + internal::inner_product_fp32_armv8(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__ARM_NEON) + const int original_dim = dim - 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_fp32_batch_armv8(vectors, query, n, original_dim, + distances); + + for (int i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/cosine.h b/src/turbo/armv8/float32/cosine.h new file mode 100644 index 000000000..529e11ef3 --- /dev/null +++ b/src/turbo/armv8/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/inner_product.cc b/src/turbo/armv8/float32/inner_product.cc new file mode 100644 index 000000000..7cfbd7784 --- /dev/null +++ b/src/turbo/armv8/float32/inner_product.cc @@ -0,0 +1,52 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/float32/inner_product.h" +#include "armv8/float32/inner_product_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +// Compute squared Euclidean distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + inner_product_fp32_armv8(a, b, dim, distance); +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__ARM_NEON) + inner_product_fp32_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/inner_product.h b/src/turbo/armv8/float32/inner_product.h new file mode 100644 index 000000000..a1d8b612f --- /dev/null +++ b/src/turbo/armv8/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/float32/inner_product_common.h b/src/turbo/armv8/float32/inner_product_common.h new file mode 100644 index 000000000..26ad45d21 --- /dev/null +++ b/src/turbo/armv8/float32/inner_product_common.h @@ -0,0 +1,142 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_FP32_GENERAL(m, q, sum) sum += (m * q); + +namespace zvec::turbo::armv8::internal { + +static __attribute__((always_inline)) void inner_product_fp32_armv8( + const void *a, const void *b, size_t size, float *distance) { + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + size; + const float *last_aligned = lhs + ((size >> 3) << 3); + + float32x4_t v_sum_0 = vdupq_n_f32(0); + float32x4_t v_sum_1 = vdupq_n_f32(0); + + for (; lhs != last_aligned; lhs += 8, rhs += 8) { + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + v_sum_1 = vfmaq_f32(v_sum_1, vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + } + if (last >= last_aligned + 4) { + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs), vld1q_f32(rhs)); + lhs += 4; + rhs += 4; + } + + float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = -result; +} + +template +static __attribute__((always_inline)) void inner_product_fp32_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < 
batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void inner_product_fp32_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp32_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp32_batch_armv8_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/float32/squared_euclidean.cc b/src/turbo/armv8/float32/squared_euclidean.cc new file mode 100644 index 000000000..b39fdac2e --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean.cc @@ -0,0 +1,54 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/float32/squared_euclidean.h" +#include "armv8/float32/squared_euclidean_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + squared_euclidean_fp32_armv8(a, b, dim, distance); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__ARM_NEON) + squared_euclidean_fp32_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/squared_euclidean.h b/src/turbo/armv8/float32/squared_euclidean.h new file mode 100644 index 000000000..3df75f17a --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/float32/squared_euclidean_common.h b/src/turbo/armv8/float32/squared_euclidean_common.h new file mode 100644 index 000000000..4f3419c56 --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean_common.h @@ -0,0 +1,150 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +//! 
Calculate Sum-of-Squared-Differences (GENERAL) +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +namespace zvec::turbo::armv8::internal { + +static __attribute__((always_inline)) void squared_euclidean_fp32_armv8( + const void *a, const void *b, size_t size, float *distance) { + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + size; + const float *last_aligned = lhs + ((size >> 3) << 3); + + float32x4_t v_sum_0 = vdupq_n_f32(0); + float32x4_t v_sum_1 = vdupq_n_f32(0); + + for (; lhs != last_aligned; lhs += 8, rhs += 8) { + float32x4_t v_d_0 = vsubq_f32(vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + float32x4_t v_d_1 = vsubq_f32(vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d_0, v_d_0); + v_sum_1 = vfmaq_f32(v_sum_1, v_d_1, v_d_1); + } + if (last >= last_aligned + 4) { + float32x4_t v_d = vsubq_f32(vld1q_f32(lhs), vld1q_f32(rhs)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d, v_d); + lhs += 4; + rhs += 4; + } + + float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); + switch (last - lhs) { + case 3: + SSD_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = result; +} + +template +static __attribute__((always_inline)) void +squared_euclidean_fp32_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < 
batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void squared_euclidean_fp32_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + squared_euclidean_fp32_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + squared_euclidean_fp32_batch_armv8_impl<1>( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } +} + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/cosine.cc b/src/turbo/armv8/half_float/cosine.cc new file mode 100644 index 000000000..baf39c702 --- /dev/null +++ b/src/turbo/armv8/half_float/cosine.cc @@ -0,0 +1,62 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +namespace zvec::turbo::armv8 { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_armv8(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/cosine.h b/src/turbo/armv8/half_float/cosine.h new file mode 100644 index 000000000..7d79f7bd7 --- /dev/null +++ b/src/turbo/armv8/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/inner_product.cc b/src/turbo/armv8/half_float/inner_product.cc new file mode 100644 index 000000000..7e0dcc448 --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product.cc @@ -0,0 +1,58 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, distance, 0ull, ) + +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__ARM_NEON) + inner_product_fp16_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/inner_product.h b/src/turbo/armv8/half_float/inner_product.h new file mode 100644 index 000000000..cfd824459 --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/inner_product_common.h b/src/turbo/armv8/half_float/inner_product_common.h new file mode 100644 index 000000000..54c3072ff --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product_common.h @@ -0,0 +1,210 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Scalar fused multiply-add for inner product (FP16 general) +#define ACCUM_FP16_STEP_GENERAL(m, q, sum) sum += (m * q); + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +//! 
NEON fused multiply-add for inner product (FP16) +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + v_sum = vfmaq_f16(v_sum, v_m, v_q); + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + _PROC(v_m, v_q, _RES##_0_0) \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP16_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float16x8_t v_m = \ + vcombine_f16(vld1_f16((const float16_t *)m), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + float16x8_t v_q = \ + vcombine_f16(vld1_f16((const float16_t *)q), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(v_sum_0_0)), \ + vcvt_high_f32_f16(v_sum_0_0))); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#else + +//! NEON fused multiply-add for inner product (FP32) +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + v_sum = vfmaq_f32(v_sum, v_m, v_q); + +//! 
Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + float32x4_t v_m_0 = vcvt_f32_f16(vget_low_f16(v_m)); \ + float32x4_t v_q_0 = vcvt_f32_f16(vget_low_f16(v_q)); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + v_m_0 = vcvt_high_f32_f16(v_m); \ + v_q_0 = vcvt_high_f32_f16(v_q); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP32_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float32x4_t v_m = vcvt_f32_f16(vld1_f16((const float16_t *)m)); \ + float32x4_t v_q = vcvt_f32_f16(vld1_f16((const float16_t *)q)); \ + ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(v_sum_0_0); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + +template +static __attribute__((always_inline)) void inner_product_fp16_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], 
vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void inner_product_fp16_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp16_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp16_batch_armv8_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/squared_euclidean.cc b/src/turbo/armv8/half_float/squared_euclidean.cc new file mode 100644 index 000000000..5f6ac829b --- /dev/null 
+++ b/src/turbo/armv8/half_float/squared_euclidean.cc @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/squared_euclidean.h" +#include "armv8/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__ARM_NEON) + squared_euclidean_fp16_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/squared_euclidean.h b/src/turbo/armv8/half_float/squared_euclidean.h new file mode 100644 index 000000000..5a540b590 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 
2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute squared euclidean distance between a single quantized FP16 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP16. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/squared_euclidean_common.h b/src/turbo/armv8/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..df3807e61 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean_common.h @@ -0,0 +1,219 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Scalar sum of squared difference (FP16 general) +#define ACCUM_FP16_STEP_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +//! NEON sum of squared difference (FP16) +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float16x8_t v_d = vsubq_f16(v_m, v_q); \ + v_sum = vfmaq_f16(v_sum, v_d, v_d); \ + } + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + _PROC(v_m, v_q, _RES##_0_0) \ + } +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP16_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float16x8_t v_m = \ + vcombine_f16(vld1_f16((const float16_t *)m), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + float16x8_t v_q = \ + vcombine_f16(vld1_f16((const float16_t *)q), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(v_sum_0_0)), \ + vcvt_high_f32_f16(v_sum_0_0))); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#else + +//! NEON sum of squared difference (FP32) +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float32x4_t v_d = vsubq_f32(v_m, v_q); \ + v_sum = vfmaq_f32(v_sum, v_d, v_d); \ + } + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + float32x4_t v_m_0 = vcvt_f32_f16(vget_low_f16(v_m)); \ + float32x4_t v_q_0 = vcvt_f32_f16(vget_low_f16(v_q)); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + v_m_0 = vcvt_high_f32_f16(v_m); \ + v_q_0 = vcvt_high_f32_f16(v_q); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + } + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP32_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float32x4_t v_m = vcvt_f32_f16(vld1_f16((const float16_t *)m)); \ + float32x4_t v_q = vcvt_f32_f16(vld1_f16((const float16_t *)q)); \ + ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(v_sum_0_0); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + +template +static __attribute__((always_inline)) void +squared_euclidean_fp16_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + 
reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void squared_euclidean_fp16_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + squared_euclidean_fp16_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + squared_euclidean_fp16_batch_armv8_impl<1>( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } +} +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/avx/float32/common.h b/src/turbo/avx/float32/common.h new file mode 100644 index 000000000..acd06f0de --- /dev/null +++ b/src/turbo/avx/float32/common.h @@ -0,0 +1,166 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX__) + +#include +#include +#include +#include + +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_FP32_GENERAL(m, q, sum) sum += (m * q); + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float sum4(__m128 v) { + v = _mm_add_ps(v, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 8))); + return _mm_cvtss_f32(v) + _mm_cvtss_f32(_mm_shuffle_ps(v, v, 1)); +} + +static inline __m128 sum_top_bottom_avx(__m256 v) { + const __m128 high = _mm256_extractf128_ps(v, 1); + const __m128 low = _mm256_castps256_ps128(v); + return _mm_add_ps(high, low); +} + + +template +static std::enable_if_t, void> +inner_product_fp32_batch_avx_impl( + const ValueType *query, const ValueType *const *ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m256 accs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_setzero_ps(); + } + size_t dim = 0; + for (; dim + 8 <= dimensionality; dim += 8) { + __m256 q = _mm256_loadu_ps(query + dim); + + __m256 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm256_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + 
dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_fnmadd_ps(q, data_regs[i], accs[i]); + } + } + + __m128 sum128_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = sum_top_bottom_avx(accs[i]); + } + if (dim + 4 <= dimensionality) { + __m128 q = _mm_loadu_ps(query + dim); + + __m128 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = _mm_fnmadd_ps(q, data_regs[i], sum128_regs[i]); + } + dim += 4; + } + if (dim + 2 <= dimensionality) { + __m128 q = _mm_setzero_ps(); + + __m128 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_setzero_ps(); + } + + q = _mm_loadh_pi(q, (const __m64 *)(query + dim)); + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_loadh_pi(data_regs[i], (const __m64 *)(ptrs[i] + dim)); + } + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = _mm_fnmadd_ps(q, data_regs[i], sum128_regs[i]); + } + dim += 2; + } + + float res[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + res[i] = sum4(sum128_regs[i]); + } + if (dim < dimensionality) { + float q = query[dim]; + for (size_t i = 0; i < dp_batch; ++i) { + res[i] -= q * ptrs[i][dim]; + } + } + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = -res[i]; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. 
+static __attribute__((always_inline)) void inner_product_fp32_batch_avx( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + const float *typed_query = reinterpret_cast(query); + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = reinterpret_cast( + vectors[i + j + batch_size * prefetch_step]); + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp32_batch_avx_impl( + typed_query, reinterpret_cast(&vectors[i]), + prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp32_batch_avx_impl( + typed_query, reinterpret_cast(&vectors[i]), + prefetch_ptrs, dim, distances + i); + } +} + + +#endif \ No newline at end of file diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc new file mode 100644 index 000000000..d2f94f4bf --- /dev/null +++ b/src/turbo/avx/float32/cosine.cc @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/float32/cosine.h" +#include "avx/float32/common.h" +#include "avx/float32/inner_product.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + constexpr size_t extra_dim = 1; + size_t d = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, d, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + constexpr size_t extra_dim = 1; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp32_batch_distance(vectors, query, n, original_dim, distances); + + for (int i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/cosine.h b/src/turbo/avx/float32/cosine.h new file mode 100644 index 000000000..514a705e0 --- /dev/null +++ b/src/turbo/avx/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc new file mode 100644 index 000000000..10b30eee3 --- /dev/null +++ b/src/turbo/avx/float32/inner_product.cc @@ -0,0 +1,120 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/float32/inner_product.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#include +#endif + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 4) << 4); + + __m256 ymm_sum_0 = _mm256_setzero_ps(); + __m256 ymm_sum_1 = _mm256_setzero_ps(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_lhs_0 = _mm256_load_ps(lhs + 0); + __m256 ymm_lhs_1 = _mm256_load_ps(lhs + 8); + __m256 ymm_rhs_0 = _mm256_load_ps(rhs + 0); + __m256 ymm_rhs_1 = _mm256_load_ps(rhs + 8); + ymm_sum_0 = _mm256_fmadd_ps(ymm_lhs_0, ymm_rhs_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_lhs_1, ymm_rhs_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + ymm_sum_0 = + _mm256_fmadd_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs), ymm_sum_0); + lhs += 8; + rhs += 8; + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_lhs_0 = _mm256_loadu_ps(lhs + 0); + __m256 ymm_lhs_1 = _mm256_loadu_ps(lhs + 8); + __m256 ymm_rhs_0 = _mm256_loadu_ps(rhs + 0); + __m256 ymm_rhs_1 = _mm256_loadu_ps(rhs + 8); + ymm_sum_0 = _mm256_fmadd_ps(ymm_lhs_0, ymm_rhs_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_lhs_1, ymm_rhs_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + ymm_sum_0 = _mm256_fmadd_ps(_mm256_loadu_ps(lhs), _mm256_loadu_ps(rhs), + ymm_sum_0); + lhs += 8; + rhs += 8; + } + } + float result = HorizontalAdd_FP32_V256(_mm256_add_ps(ymm_sum_0, ymm_sum_1)); + + switch (last - lhs) { + case 7: + FMA_FP32_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_FP32_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_FP32_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_FP32_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + 
FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = -1 * result; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX__) + inner_product_fp32_batch_avx(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)distances; + (void)query; + (void)n; + (void)dim; +#endif // __AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/inner_product.h b/src/turbo/avx/float32/inner_product.h new file mode 100644 index 000000000..083a35f6f --- /dev/null +++ b/src/turbo/avx/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc new file mode 100644 index 000000000..9240ea7e9 --- /dev/null +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -0,0 +1,121 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/float32/squared_euclidean.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#include +#endif + +namespace zvec::turbo::avx { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 4) << 4); + + __m256 ymm_sum_0 = _mm256_setzero_ps(); + __m256 ymm_sum_1 = _mm256_setzero_ps(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_d_0 = + _mm256_sub_ps(_mm256_load_ps(lhs + 0), _mm256_load_ps(rhs + 0)); + __m256 ymm_d_1 = + _mm256_sub_ps(_mm256_load_ps(lhs + 8), _mm256_load_ps(rhs + 8)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d_0, ymm_d_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_d_1, ymm_d_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + __m256 ymm_d = _mm256_sub_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum_0); + lhs += 8; + rhs += 8; + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_d_0 = + _mm256_sub_ps(_mm256_loadu_ps(lhs + 0), _mm256_loadu_ps(rhs + 0)); + __m256 ymm_d_1 = + _mm256_sub_ps(_mm256_loadu_ps(lhs + 8), _mm256_loadu_ps(rhs + 8)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d_0, ymm_d_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_d_1, ymm_d_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + __m256 ymm_d = _mm256_sub_ps(_mm256_loadu_ps(lhs), _mm256_loadu_ps(rhs)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum_0); + lhs += 8; + rhs += 8; + } + } + float result = HorizontalAdd_FP32_V256(_mm256_add_ps(ymm_sum_0, ymm_sum_1)); + + switch (last - lhs) { + case 7: + SSD_FP32_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + SSD_FP32_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + 
SSD_FP32_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + SSD_FP32_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + SSD_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)distances; + (void)query; + (void)n; + (void)dim; +#endif // __AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/squared_euclidean.h b/src/turbo/avx/float32/squared_euclidean.h new file mode 100644 index 000000000..9e11f15bc --- /dev/null +++ b/src/turbo/avx/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. 
+void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc new file mode 100644 index 000000000..27a3c7dbd --- /dev/null +++ b/src/turbo/avx/half_float/cosine.cc @@ -0,0 +1,67 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/half_float/cosine.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/inner_product_common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + constexpr size_t extra_dim = 2; + size_t d = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, d, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/cosine.h b/src/turbo/avx/half_float/cosine.h new file mode 100644 index 000000000..5bd0a66f5 --- /dev/null +++ b/src/turbo/avx/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP16 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp16_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc new file mode 100644 index 000000000..4ac05de2a --- /dev/null +++ b/src/turbo/avx/half_float/inner_product.cc @@ -0,0 +1,58 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/half_float/inner_product.h" +#include "avx/half_float/inner_product_common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. 
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const ailego::Float16 *lhs = reinterpret_cast(a); + const ailego::Float16 *rhs = reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, NEGATE_FP32_GENERAL) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.h b/src/turbo/avx/half_float/inner_product.h new file mode 100644 index 000000000..08b5a8d73 --- /dev/null +++ b/src/turbo/avx/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h new file mode 100644 index 000000000..a6816d022 --- /dev/null +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -0,0 +1,179 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX__) + +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx { +//! Reverse sign of value (GENERAL) +#define NEGATE_FP32_GENERAL(v) -(v) + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! 
Calculate Fused-Multiply-Add (AVX) +#define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); + +#define ACCUM_FP32_STEP_AVX FMA_FP32_AVX + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 4) << 4); \ + if (((uintptr_t)m & 0x1f) == 0 && ((uintptr_t)q & 0x1f) == 0) { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_load_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } else { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_loadu_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i 
*)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx + +#endif \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc new file mode 100644 index 000000000..24913891c --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -0,0 +1,55 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/half_float/squared_euclidean.h" +#include "avx/half_float/squared_euclidean_common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const ailego::Float16 *lhs = reinterpret_cast(a); + const ailego::Float16 *rhs = reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.h b/src/turbo/avx/half_float/squared_euclidean.h new file mode 100644 index 000000000..013b1f118 --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. 
+void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/squared_euclidean_common.h b/src/turbo/avx/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..8e58393d7 --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean_common.h @@ -0,0 +1,180 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX__) + +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx { + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! 
Calculate sum of squared difference (AVX) +#define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + { \ + __m256 ymm_d = _mm256_sub_ps(ymm_m, ymm_q); \ + ymm_sum = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX SSD_FP32_AVX + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 4) << 4); \ + if (((uintptr_t)m & 0x1f) == 0 && ((uintptr_t)q & 0x1f) == 0) { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_load_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } else { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_loadu_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx + +#endif \ No newline at end of file diff --git a/src/turbo/avx2/half_float_converter/common.h b/src/turbo/avx2/half_float_converter/common.h new file mode 100644 index 000000000..1b05591e8 --- /dev/null +++ b/src/turbo/avx2/half_float_converter/common.h @@ -0,0 +1,26 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..21e05b2c0 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -0,0 +1,95 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/inner_product_common.h" +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_batch_avx2(vectors, query, n, original_dim, + distances); + + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + + for (int i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + + float &result = distances[i]; + result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git 
a/src/turbo/avx2/record_quantized_int4/cosine.h b/src/turbo/avx2/record_quantized_int4/cosine.h new file mode 100644 index 000000000..77b4adad9 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..e70cf2ed1 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -0,0 +1,76 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx2/record_quantized_int4/inner_product.h" +#include "avx2/record_quantized_int4/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +// Compute squared Euclidean distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX2__ +} + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX2__) + internal::inner_product_int4_batch_avx2(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.h b/src/turbo/avx2/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..0e9e69d63 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int4/inner_product_common.h b/src/turbo/avx2/record_quantized_int4/inner_product_common.h new file mode 100644 index 000000000..8c96f5fb0 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product_common.h @@ -0,0 +1,250 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void inner_product_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +template +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff 
--git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc
new file mode 100644
index 000000000..1599a722d
--- /dev/null
+++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc
@@ -0,0 +1,111 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "avx2/record_quantized_int4/squared_euclidean.h"
+#include "avx2/record_quantized_int4/inner_product_common.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>
+#endif
+
+namespace zvec::turbo::avx2 {
+
+// Compute squared Euclidean distance between one quantized INT4 record
+// pair. `dim` is the total record length in bytes (packed payload plus a
+// 32-byte tail of per-record quantization floats).
+void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim,
+                                     float *distance) {
+#if defined(__AVX2__)
+  const int d = static_cast<int>(dim) - 32;
+
+  // Signed guard: the previous `original_dim <= 0` test was on an unsigned
+  // value and silently accepted records shorter than the tail (dim < 32
+  // wrapped to a huge payload size).
+  if (d <= 0) {
+    return;
+  }
+  const size_t original_dim = static_cast<size_t>(d) >> 1;
+
+  // Raw integer dot product of the packed payloads.
+  internal::inner_product_int4_avx2(a, b, original_dim, distance);
+
+  const float *a_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const uint8_t *>(a) + original_dim);
+  const float *b_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const uint8_t *>(b) + original_dim);
+
+  // Query-side dequantization floats (semantics set by the quantizer).
+  float qa = a_tail[0];
+  float qb = a_tail[1];
+  float qs = a_tail[2];
+  float qs2 = a_tail[3];
+
+  const float sum = qa * qs;
+  const float sum2 = qa * qa * qs2;
+
+  float ma = b_tail[0];
+  float mb = b_tail[1];
+  float ms = b_tail[2];
+  float ms2 = b_tail[3];
+
+  // Expand ||x - y||^2 from the quantized dot product and tail statistics.
+  *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance +
+              (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum);
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif  // __AVX2__
+}
+
+// Batch version: one query against `n` records, writing `distances[0..n)`.
+void squared_euclidean_int4_batch_distance(const void *const *vectors,
+                                           const void *query, size_t n,
+                                           size_t dim, float *distances) {
+#if defined(__AVX2__)
+  const int d = static_cast<int>(dim) - 32;
+
+  // Signed guard (see squared_euclidean_int4_distance): an unsigned
+  // `original_dim <= 0` check can never fire for dim < 32.
+  if (d <= 0) {
+    return;
+  }
+  const size_t original_dim = static_cast<size_t>(d) >> 1;
+
+  internal::inner_product_int4_batch_avx2(vectors, query, n, original_dim,
+                                          distances);
+
+  const float *q_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const uint8_t *>(query) + original_dim);
+
+  // Query-side dequantization floats, hoisted out of the per-record loop.
+  float qa = q_tail[0];
+  float qb = q_tail[1];
+  float qs = q_tail[2];
+  float qs2 = q_tail[3];
+
+  const float sum = qa * qs;
+  const float sum2 = qa * qa * qs2;
+
+  // size_t index: `n` is size_t, a plain int index mixed signedness.
+  for (size_t i = 0; i < n; ++i) {
+    const float *m_tail = reinterpret_cast<const float *>(
+        reinterpret_cast<const uint8_t *>(vectors[i]) + original_dim);
+
+    float ma = m_tail[0];
+    float mb = m_tail[1];
+    float ms = m_tail[2];
+    float ms2 = m_tail[3];
+
+    float &result = distances[i];
+    result = ma * ma * ms2 + sum2 - 2 * ma * qa * result +
+             (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum);
+  }
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif  //__AVX2__
+}
+
+}  // namespace zvec::turbo::avx2
\ No newline at end of file
diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.h b/src/turbo/avx2/record_quantized_int4/squared_euclidean.h
new file mode 100644
index 000000000..b6d15f698
--- /dev/null
+++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized INT4 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/cosine.cc b/src/turbo/avx2/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..b31df0a13 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/cosine.cc @@ -0,0 +1,69 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int8/cosine.h" +#include "avx2/record_quantized_int8/inner_product_common.h" +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/cosine.h b/src/turbo/avx2/record_quantized_int8/cosine.h new file mode 100644 index 000000000..6074ea428 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int8 vector pair. +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.cc b/src/turbo/avx2/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..4745c493a --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "avx2/record_quantized_int8/inner_product.h"
+#include "avx2/record_quantized_int8/inner_product_common.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>
+#endif
+
+namespace zvec::turbo::avx2 {
+
+// Compute inner product distance between a single quantized int8 vector
+// pair. `dim` is the total record length in bytes: the int8 payload
+// followed by a 20-byte tail of per-record quantization floats.
+void inner_product_int8_distance(const void *a, const void *b, size_t dim,
+                                 float *distance) {
+#if defined(__AVX2__)
+  // Signed: the old `size_t original_dim = dim - 20` wrapped for dim < 20
+  // so its `<= 0` guard could never fire. `int` also matches the sibling
+  // int8 cosine/squared-euclidean implementations.
+  const int original_dim = static_cast<int>(dim) - 20;
+
+  if (original_dim <= 0) {
+    return;
+  }
+
+  internal::inner_product_int8_avx2(a, b, original_dim, distance);
+
+  // Quantization parameters live immediately after the int8 payload.
+  const float *a_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const int8_t *>(a) + original_dim);
+  const float *b_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const int8_t *>(b) + original_dim);
+
+  // Per-vector dequantization floats (semantics defined by the quantizer).
+  float qa = a_tail[0];
+  float qb = a_tail[1];
+  float qs = a_tail[2];
+
+  float ma = b_tail[0];
+  float mb = b_tail[1];
+  float ms = b_tail[2];
+
+  // Dequantize and negate so a larger inner product is a smaller distance.
+  *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms +
+                static_cast<float>(original_dim) * qb * mb);
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif  //__AVX2__
+}
+
+// Batch version of inner_product_int8_distance.
+void inner_product_int8_batch_distance(const void *const *vectors,
+                                       const void *query, size_t n, size_t dim,
+                                       float *distances) {
+#if defined(__AVX2__)
+  // NOTE(review): AVX2 batch path is still a stub -- `distances` is left
+  // unwritten here; confirm callers do not rely on it yet.
+
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif  //__AVX2__
+}
+
+}  // namespace zvec::turbo::avx2
\ No newline at end of file
diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.h b/src/turbo/avx2/record_quantized_int8/inner_product.h
new file mode 100644
index 000000000..249bafd00
--- /dev/null
+++ b/src/turbo/avx2/record_quantized_int8/inner_product.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/inner_product_common.h b/src/turbo/avx2/record_quantized_int8/inner_product_common.h new file mode 100644 index 000000000..0176f277a --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product_common.h @@ -0,0 +1,236 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +static __attribute__((always_inline)) void inner_product_int8_avx2( + const void *a, const void *b, size_t size, float *distance) { + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 6) << 6); + float result = 0.0; + + __m256i ymm_sum_0 = _mm256_setzero_si256(); + __m256i ymm_sum_1 = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = 
_mm256_load_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + 
__m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } + result = static_cast( + HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +template +__attribute__((always_inline)) void inner_product_int8_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + // TBD +} + +static __attribute__((always_inline)) void inner_product_int8_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static 
constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int8_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int8_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..0c3c71079 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,76 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int8/squared_euclidean.h" +#include "avx2/record_quantized_int8/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..1bbfa6676 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h new file mode 100644 index 000000000..e460ade68 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h @@ -0,0 +1,250 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void squared_euclidean_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +template +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff 
--git a/src/turbo/avx512/float32/common.h b/src/turbo/avx512/float32/common.h new file mode 100644 index 000000000..af04d0e41 --- /dev/null +++ b/src/turbo/avx512/float32/common.h @@ -0,0 +1,42 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include + +//! Calculate Fused-Multiply-Add (AVX512) +#define FMA_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ps(zmm_m, zmm_q, zmm_sum); + + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +#endif // __AVX512F__ \ No newline at end of file diff --git a/src/turbo/avx512/float32/cosine.cc b/src/turbo/avx512/float32/cosine.cc new file mode 100644 index 000000000..3fff482c4 --- /dev/null +++ b/src/turbo/avx512/float32/cosine.cc @@ -0,0 +1,67 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512/float32/cosine.h" +#include "avx512/float32/common.h" +#include "avx512/float32/inner_product.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + constexpr size_t extra_dim = 1; + size_t d = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, d, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX512F__) + // `dim` is the full encoded size; the original vector occupies dim-24 bytes. 
+ const int original_dim = dim - 1; + if (original_dim <= 0) { + return; + } + + inner_product_fp32_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/cosine.h b/src/turbo/avx512/float32/cosine.h new file mode 100644 index 000000000..7e11de89f --- /dev/null +++ b/src/turbo/avx512/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/inner_product.cc b/src/turbo/avx512/float32/inner_product.cc new file mode 100644 index 000000000..b28ef2e6a --- /dev/null +++ b/src/turbo/avx512/float32/inner_product.cc @@ -0,0 +1,104 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512/float32/inner_product.h" +#include "avx512/float32/common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +// Compute squared Euclidean distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); + + __m512 zmm_sum_0 = _mm512_setzero_ps(); + __m512 zmm_sum_1 = _mm512_setzero_ps(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + FMA_FP32_AVX512(_mm512_load_ps(lhs + 0), _mm512_load_ps(rhs + 0), + zmm_sum_0) + + FMA_FP32_AVX512(_mm512_load_ps(lhs + 16), _mm512_load_ps(rhs + 16), + zmm_sum_1) + } + + if (last >= last_aligned + 16) { + FMA_FP32_AVX512(_mm512_load_ps(lhs), _mm512_load_ps(rhs), zmm_sum_0) + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + FMA_FP32_AVX512(_mm512_loadu_ps(lhs + 0), _mm512_loadu_ps(rhs + 0), + zmm_sum_0) + + FMA_FP32_AVX512(_mm512_loadu_ps(lhs + 16), _mm512_loadu_ps(rhs + 16), + zmm_sum_1) + } + + if (last >= last_aligned + 16) { + FMA_FP32_AVX512(_mm512_loadu_ps(lhs), _mm512_loadu_ps(rhs), zmm_sum_0) + lhs += 16; + rhs += 16; + } + } + + zmm_sum_0 = _mm512_add_ps(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask16 mask = (__mmask16)((1 << (last - lhs)) - 1); + __m512 zmm_undefined = _mm512_undefined_ps(); + zmm_sum_0 = _mm512_mask3_fmadd_ps( + _mm512_mask_loadu_ps(zmm_undefined, mask, lhs), + _mm512_mask_loadu_ps(zmm_undefined, mask, rhs), zmm_sum_0, mask); + } + + *distance = -1 * HorizontalAdd_FP32_V512(zmm_sum_0); + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX512F__ +} + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp32_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/inner_product.h b/src/turbo/avx512/float32/inner_product.h new file mode 100644 index 000000000..d1f48eecf --- /dev/null +++ b/src/turbo/avx512/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/float32/squared_euclidean.cc b/src/turbo/avx512/float32/squared_euclidean.cc new file mode 100644 index 000000000..cc00cacf9 --- /dev/null +++ b/src/turbo/avx512/float32/squared_euclidean.cc @@ -0,0 +1,105 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512/float32/squared_euclidean.h" +#include "avx512/float32/common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); + + __m512 zmm_sum_0 = _mm512_setzero_ps(); + __m512 zmm_sum_1 = _mm512_setzero_ps(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m512 zmm_d_0 = + _mm512_sub_ps(_mm512_load_ps(lhs + 0), _mm512_load_ps(rhs + 0)); + __m512 zmm_d_1 = + _mm512_sub_ps(_mm512_load_ps(lhs + 16), _mm512_load_ps(rhs + 16)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ps(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m512 zmm_d = _mm512_sub_ps(_mm512_load_ps(lhs), _mm512_load_ps(rhs)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m512 zmm_d_0 = + _mm512_sub_ps(_mm512_loadu_ps(lhs + 0), _mm512_loadu_ps(rhs + 0)); + __m512 zmm_d_1 = + _mm512_sub_ps(_mm512_loadu_ps(lhs + 16), _mm512_loadu_ps(rhs + 16)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ps(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m512 zmm_d = _mm512_sub_ps(_mm512_loadu_ps(lhs), _mm512_loadu_ps(rhs)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum_0); + lhs += 16; + rhs += 16; + } + } + + zmm_sum_0 = _mm512_add_ps(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask16 mask = (__mmask16)((1 << (last - lhs)) - 1); + __m512 zmm_undefined = _mm512_undefined_ps(); + __m512 zmm_d = _mm512_mask_sub_ps( + zmm_undefined, mask, 
_mm512_mask_loadu_ps(zmm_undefined, mask, lhs), + _mm512_mask_loadu_ps(zmm_undefined, mask, rhs)); + zmm_sum_0 = _mm512_mask3_fmadd_ps(zmm_d, zmm_d, zmm_sum_0, mask); + } + + *distance = HorizontalAdd_FP32_V512(zmm_sum_0); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/squared_euclidean.h b/src/turbo/avx512/float32/squared_euclidean.h new file mode 100644 index 000000000..8b43b540e --- /dev/null +++ b/src/turbo/avx512/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc new file mode 100644 index 000000000..bf08eb744 --- /dev/null +++ b/src/turbo/avx512/half_float/cosine.cc @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512/half_float/cosine.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/inner_product_common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX512F__) + constexpr size_t extra_dim = 2; + const size_t original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + 
distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/cosine.h b/src/turbo/avx512/half_float/cosine.h new file mode 100644 index 000000000..1e068dd6e --- /dev/null +++ b/src/turbo/avx512/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc new file mode 100644 index 000000000..221d0a2ab --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512F__) +#include +#include +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/inner_product_common.h" + +using namespace zvec::turbo::avx512::internal; +#endif + +namespace zvec::turbo::avx512 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX512(lhs, rhs, dim, distance, 0ull, NEGATE_FP32_GENERAL) +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.h b/src/turbo/avx512/half_float/inner_product.h new file mode 100644 index 000000000..833d4c8c3 --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/inner_product_common.h b/src/turbo/avx512/half_float/inner_product_common.h new file mode 100644 index 000000000..dcd6f2a83 --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product_common.h @@ -0,0 +1,209 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx512::internal { +//! 
Reverse sign of value (GENERAL) +#define NEGATE_FP32_GENERAL(v) -(v) + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX512(m, q, _RES, _LOAD, _PROC) \ + { \ + __m512i zmm_mi = _LOAD((const __m512i *)m); \ + __m512i zmm_qi = _LOAD((const __m512i *)q); \ + __m512 zmm_m = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_mi)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_qi)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + zmm_m = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_mi, 1)); \ + zmm_q = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_qi, 1)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + } + +//! Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int 
*)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + 
__m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +//! Calculate Fused-Multiply-Add (AVX) +#define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); + +#define ACCUM_FP32_STEP_AVX FMA_FP32_AVX + +//! Calculate Fused-Multiply-Add (AVX512) +#define FMA_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ps(zmm_m, zmm_q, zmm_sum); + +#define ACCUM_FP32_STEP_AVX512 FMA_FP32_AVX512 + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX512(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m512, zmm_sum, _mm512_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 5) << 5); \ + if (((uintptr_t)m & 0x3f) == 0 && ((uintptr_t)q & 0x3f) == 0) { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_load_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } else { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_loadu_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } \ + __m256 ymm_sum_0_0 = _mm256_add_ps(_mm512_castps512_ps256(zmm_sum_0_0), \ + _mm256_castpd_ps(_mm512_extractf64x4_pd( \ + _mm512_castps_pd(zmm_sum_0_0), 1))); \ + if (qe >= q + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx512::internal + +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc new file mode 100644 index 000000000..7a4b18e11 --- 
/dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512F__) +#include +#include +#include "avx512/half_float/squared_euclidean.h" +#include "avx512/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::avx512::internal; +#endif + +namespace zvec::turbo::avx512 { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX512(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/squared_euclidean.h b/src/turbo/avx512/half_float/squared_euclidean.h new file mode 100644 index 000000000..399e238b0 --- /dev/null +++ 
b/src/turbo/avx512/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/squared_euclidean_common.h b/src/turbo/avx512/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..6ff8c4254 --- /dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean_common.h @@ -0,0 +1,200 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx512::internal { + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX512(m, q, _RES, _LOAD, _PROC) \ + { \ + __m512i zmm_mi = _LOAD((const __m512i *)m); \ + __m512i zmm_qi = _LOAD((const __m512i *)q); \ + __m512 zmm_m = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_mi)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_qi)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + zmm_m = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_mi, 1)); \ + zmm_q = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_qi, 1)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + } + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +//! Calculate sum of squared difference (AVX) +#define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + { \ + __m256 ymm_d = _mm256_sub_ps(ymm_m, ymm_q); \ + ymm_sum = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX SSD_FP32_AVX + +//! 
Calculate sum of squared difference (AVX512) +#define SSD_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + { \ + __m512 zmm_d = _mm512_sub_ps(zmm_m, zmm_q); \ + zmm_sum = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX512 SSD_FP32_AVX512 + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX512(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m512, zmm_sum, _mm512_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 5) << 5); \ + if (((uintptr_t)m & 0x3f) == 0 && ((uintptr_t)q & 0x3f) == 0) { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_load_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } else { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_loadu_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } \ + __m256 ymm_sum_0_0 = _mm256_add_ps(_mm512_castps512_ps256(zmm_sum_0_0), \ + _mm256_castpd_ps(_mm512_extractf64x4_pd( \ + _mm512_castps_pd(zmm_sum_0_0), 1))); \ + if (qe >= q + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx512::internal + +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512_fp16/half_float/cosine.cc b/src/turbo/avx512_fp16/half_float/cosine.cc new file mode 100644 index 000000000..a5404712a --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/cosine.cc @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512_fp16/half_float/cosine.h" +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/inner_product_common.h" + +#if defined(__AVX512FP16__) +#include +#endif + +namespace zvec::turbo::avx512_fp16 { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512FP16__) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX512FP16__) + constexpr size_t extra_dim = 2; + const size_t original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/cosine.h b/src/turbo/avx512_fp16/half_float/cosine.h new file mode 100644 index 000000000..2b57bcf9e --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512_fp16 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/inner_product.cc b/src/turbo/avx512_fp16/half_float/inner_product.cc new file mode 100644 index 000000000..c7262577d --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product.cc @@ -0,0 +1,112 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512FP16__) +#include +#include +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/inner_product_common.h" + +using namespace zvec::ailego; + +using namespace zvec::turbo::avx512_fp16::internal; + +#endif + +namespace zvec::turbo::avx512_fp16 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. 
// Compute the inner-product distance between a single FP16 vector pair
// using AVX512-FP16 intrinsics.
//
// a, b     : pointers to `dim` Float16 values each.
// dim      : number of elements.
// distance : out-parameter; receives -(a . b), so a larger inner product
//            maps to a smaller distance. Left untouched when AVX512-FP16
//            support is compiled out.
void inner_product_fp16_distance(const void *a, const void *b, size_t dim,
                                 float *distance) {
#if defined(__AVX512FP16__)
  const Float16 *lhs = reinterpret_cast<const Float16 *>(a);
  const Float16 *rhs = reinterpret_cast<const Float16 *>(b);

  const Float16 *last = lhs + dim;
  // Main loop consumes 64 elements per iteration (two 32-lane accumulators
  // to hide FMA latency).
  const Float16 *last_aligned = lhs + ((dim >> 6) << 6);

  __m512h zmm_sum_0 = _mm512_setzero_ph();
  __m512h zmm_sum_1 = _mm512_setzero_ph();

  if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) {
    // Both operands are 64-byte aligned: use aligned loads.
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      FMA_FP16_AVX512FP16(_mm512_load_ph(lhs + 0), _mm512_load_ph(rhs + 0),
                          zmm_sum_0)
      FMA_FP16_AVX512FP16(_mm512_load_ph(lhs + 32), _mm512_load_ph(rhs + 32),
                          zmm_sum_1)
    }
    if (last >= last_aligned + 32) {
      FMA_FP16_AVX512FP16(_mm512_load_ph(lhs), _mm512_load_ph(rhs), zmm_sum_0)
      lhs += 32;
      rhs += 32;
    }
  } else {
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs + 0), _mm512_loadu_ph(rhs + 0),
                          zmm_sum_0)
      FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs + 32), _mm512_loadu_ph(rhs + 32),
                          zmm_sum_1)
    }
    if (last >= last_aligned + 32) {
      FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs), _mm512_loadu_ph(rhs), zmm_sum_0)
      lhs += 32;
      rhs += 32;
    }
  }

  zmm_sum_0 = _mm512_add_ph(zmm_sum_0, zmm_sum_1);

  if (lhs != last) {
    // Tail of 1..31 elements: masked loads + masked FMA.
    // Use an unsigned literal for the shift: (last - lhs) can be 31, and
    // `1 << 31` on a signed int is undefined behavior.
    __mmask32 mask = (__mmask32)((1u << (last - lhs)) - 1u);
    __m512i zmm_undefined = _mm512_undefined_epi32();
    zmm_sum_0 = _mm512_mask3_fmadd_ph(
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, lhs)),
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, rhs)),
        zmm_sum_0, mask);
  }

  // Negate so that a larger inner product yields a smaller distance.
  *distance = -HorizontalAdd_FP16_V512(zmm_sum_0);
#else
  (void)a;
  (void)b;
  (void)dim;
  (void)distance;
#endif
}

// Batch version of inner_product_fp16_distance.
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512FP16__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __AVX512FP16__ +} + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/inner_product.h b/src/turbo/avx512_fp16/half_float/inner_product.h new file mode 100644 index 000000000..a80944713 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512_fp16 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512_fp16 diff --git a/src/turbo/avx512_fp16/half_float/inner_product_common.h b/src/turbo/avx512_fp16/half_float/inner_product_common.h new file mode 100644 index 000000000..30921e038 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product_common.h @@ -0,0 +1,53 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512FP16__) +#include +#include +#include + +namespace zvec::turbo::avx512_fp16::internal { + +//! 
Calculate Fused-Multiply-Add (AVX512FP16) +#define FMA_FP16_AVX512FP16(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ph(zmm_m, zmm_q, zmm_sum); + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +static inline float HorizontalAdd_FP16_V512(__m512h v) { + __m512 low = _mm512_cvtxph_ps(_mm512_castph512_ph256(v)); + __m512 high = _mm512_cvtxph_ps( + _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(v), 1))); + + return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); +} + +} // namespace zvec::turbo::avx512_fp16::internal + +#endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc new file mode 100644 index 000000000..5e33255b3 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc @@ -0,0 +1,114 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#include <cstddef>  // size_t -- NOTE(review): original include target lost in extraction; confirm

// NOTE(review): this include block was guarded by __AVX512F__ while every
// function body below is guarded by __AVX512FP16__; use one guard so the
// FP16 helpers are only pulled in when they can actually be used.
#if defined(__AVX512FP16__)
#include <cstdint>     // uintptr_t -- NOTE(review): original target lost in extraction; confirm
#include <immintrin.h>
#include "avx512_fp16/half_float/squared_euclidean.h"
#include "avx512_fp16/half_float/squared_euclidean_common.h"

using namespace zvec::ailego;

using namespace zvec::turbo::avx512_fp16::internal;

#endif

namespace zvec::turbo::avx512_fp16 {

// Compute the squared Euclidean distance between a single FP16 vector pair
// using AVX512-FP16 intrinsics.
//
// a, b     : pointers to `dim` Float16 values each.
// dim      : number of elements.
// distance : out-parameter; receives sum((a[i] - b[i])^2). Left untouched
//            when AVX512-FP16 support is compiled out.
void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim,
                                     float *distance) {
#if defined(__AVX512FP16__)
  const Float16 *lhs = reinterpret_cast<const Float16 *>(a);
  const Float16 *rhs = reinterpret_cast<const Float16 *>(b);

  const Float16 *last = lhs + dim;
  // Main loop consumes 64 elements per iteration (two 32-lane accumulators
  // to hide FMA latency).
  const Float16 *last_aligned = lhs + ((dim >> 6) << 6);

  __m512h zmm_sum_0 = _mm512_setzero_ph();
  __m512h zmm_sum_1 = _mm512_setzero_ph();

  if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) {
    // Both operands are 64-byte aligned: use aligned loads.
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      __m512h zmm_d_0 =
          _mm512_sub_ph(_mm512_load_ph(lhs + 0), _mm512_load_ph(rhs + 0));
      __m512h zmm_d_1 =
          _mm512_sub_ph(_mm512_load_ph(lhs + 32), _mm512_load_ph(rhs + 32));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d_0, zmm_d_0, zmm_sum_0);
      zmm_sum_1 = _mm512_fmadd_ph(zmm_d_1, zmm_d_1, zmm_sum_1);
    }

    if (last >= last_aligned + 32) {
      __m512h zmm_d = _mm512_sub_ph(_mm512_load_ph(lhs), _mm512_load_ph(rhs));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d, zmm_d, zmm_sum_0);
      lhs += 32;
      rhs += 32;
    }
  } else {
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      __m512h zmm_d_0 =
          _mm512_sub_ph(_mm512_loadu_ph(lhs + 0), _mm512_loadu_ph(rhs + 0));
      __m512h zmm_d_1 =
          _mm512_sub_ph(_mm512_loadu_ph(lhs + 32), _mm512_loadu_ph(rhs + 32));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d_0, zmm_d_0, zmm_sum_0);
      zmm_sum_1 = _mm512_fmadd_ph(zmm_d_1, zmm_d_1, zmm_sum_1);
    }

    if (last >= last_aligned + 32) {
      __m512h zmm_d = _mm512_sub_ph(_mm512_loadu_ph(lhs), _mm512_loadu_ph(rhs));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d, zmm_d, zmm_sum_0);
      lhs += 32;
      rhs += 32;
    }
  }

  zmm_sum_0 = _mm512_add_ph(zmm_sum_0, zmm_sum_1);
  if (lhs != last) {
    // Tail of 1..31 elements: masked loads, masked subtract, masked FMA.
    // Use an unsigned literal for the shift: (last - lhs) can be 31, and
    // `1 << 31` on a signed int is undefined behavior.
    __mmask32 mask = (__mmask32)((1u << (last - lhs)) - 1u);
    __m512i zmm_undefined = _mm512_undefined_epi32();
    __m512h zmm_undefined_ph = _mm512_undefined_ph();
    __m512h zmm_d = _mm512_mask_sub_ph(
        zmm_undefined_ph, mask,
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, lhs)),
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, rhs)));
    zmm_sum_0 = _mm512_mask3_fmadd_ph(zmm_d, zmm_d, zmm_sum_0, mask);
  }

  *distance = HorizontalAdd_FP16_V512(zmm_sum_0);
#else
  (void)a;
  (void)b;
  (void)dim;
  (void)distance;
#endif  // __AVX512FP16__
}

// Batch version of squared_euclidean_fp16_distance.
//
// BUG FIX: this definition was named squared_euclidean_fp32_batch_distance,
// while squared_euclidean.h declares squared_euclidean_fp16_batch_distance;
// the declared symbol was never defined, producing a link error for any
// caller. No header declared the fp32-named symbol, so renaming is safe.
void squared_euclidean_fp16_batch_distance(const void *const *vectors,
                                           const void *query, size_t n,
                                           size_t dim, float *distances) {
#if defined(__AVX512FP16__)
  for (size_t i = 0; i < n; ++i) {
    squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]);
  }
#else
  (void)vectors;
  (void)query;
  (void)n;
  (void)dim;
  (void)distances;
#endif  // __AVX512FP16__
}

}  // namespace zvec::turbo::avx512_fp16
// ---- patch metadata from the mangled diff (preserved as comments) ----
// diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.h b/src/turbo/avx512_fp16/half_float/squared_euclidean.h
// new file mode 100644
// index 000000000..669749f51
// --- /dev/null
// +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.h
// @@ -0,0 +1,31 @@
// Copyright 2025-present the zvec project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +#pragma once + +#include + +namespace zvec::turbo::avx512_fp16 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512_fp16 diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..b5f91988e --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__AVX512FP16__) +#include +#include +#include + +namespace zvec::turbo::avx512_fp16::internal { + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +static inline float HorizontalAdd_FP16_V512(__m512h v) { + __m512 low = _mm512_cvtxph_ps(_mm512_castph512_ph256(v)); + __m512 high = _mm512_cvtxph_ps( + _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(v), 1))); + + return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); +} + +} // namespace zvec::turbo::avx512_fp16::internal + +#endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..db83b128a --- /dev/null +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc @@ -0,0 +1,65 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512_vnni/record_quantized_int8/inner_product.h" +#include +#include "avx512_vnni/record_quantized_int8/common.h" + +namespace zvec::turbo::avx512_vnni { + +// Compute squared Euclidean distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const size_t original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::ip_int8_avx512_vnni(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512VNNI__) + internal::ip_int8_batch_avx512_vnni(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __AVX512VNNI__ +} + +} // namespace zvec::turbo::avx512_vnni \ No newline at end of file diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..25f0ce109 --- /dev/null +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512_vnni { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512_vnni diff --git a/src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc b/src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc new file mode 100644 index 000000000..e69de29bb diff --git a/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc b/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc new file mode 100644 index 000000000..72617e56b --- /dev/null +++ b/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc @@ -0,0 +1,21 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#pragma once + +namespace zvec { +namespace turbo {} // namespace turbo +} // namespace zvec \ No newline at end of file diff --git a/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h b/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h new file mode 100644 index 000000000..8e083ae25 --- /dev/null +++ b/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h @@ -0,0 +1,48 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#pragma once + +namespace zvec { +namespace turbo { + +class RecordInt8Quantizer : public Quantizer { + public: + RecordInt8Quantizer() : type_{QuantizeType::kRecordInt8} {} + + virtual ~RecordInt8Quantizer() {} + + public: + QuantizeType type() const override { + return type_; + } + + const IndexMeta &meta(void) const override { + return meta_; + } + + private: + IndexMeta meta_{}; + IndexHolder::Pointer holder_{}; + std::shared_ptr quantizer_{}; + Stats stats_{}; + IndexMeta::DataType data_type_{}; +}; + + +} // namespace turbo +} // namespace zvec diff --git a/src/turbo/quantizer/quantizer.h b/src/turbo/quantizer/quantizer.h new file mode 100644 index 000000000..b051a6d87 --- /dev/null +++ b/src/turbo/quantizer/quantizer.h @@ -0,0 +1,33 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#pragma once + +namespace zvec { +namespace turbo { + +class Quantizer { + public: + Quantizer() {}; + virtual ~Quantizer() {}; + + private: + QuantizeType type_{QuantizeType::kDefault}; +}; + +} // namespace turbo +} // namespace zvec diff --git a/src/turbo/scalar/float32/cosine.cc b/src/turbo/scalar/float32/cosine.cc new file mode 100644 index 000000000..cffb0b166 --- /dev/null +++ b/src/turbo/scalar/float32/cosine.cc @@ -0,0 +1,39 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float32/cosine.h" +#include "scalar/float32/inner_product.h" + +namespace zvec::turbo::scalar { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { + constexpr size_t extra_dim = 1; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + inner_product_fp32_batch_distance(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; i++) { + distances[i] = 1 - distances[i]; + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/cosine.h b/src/turbo/scalar/float32/cosine.h new file mode 100644 index 000000000..b5e4f4eee --- /dev/null +++ b/src/turbo/scalar/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
// ---- file: turbo/scalar/float32/inner_product.cc ----

// Guarded so this TU also builds standalone (the project header only
// repeats the declarations defined below).
#if __has_include("scalar/float32/inner_product.h")
#include "scalar/float32/inner_product.h"
#endif

#include <cstddef>

namespace zvec::turbo::scalar {

// Inner-product distance between one pair of FP32 vectors: the negated dot
// product, so smaller values mean "more similar".
// (The previous comment said "squared Euclidean distance" — wrong kernel.)
void inner_product_fp32_distance(const void *a, const void *b, size_t dim,
                                 float *distance) {
  const float *m = reinterpret_cast<const float *>(a);
  const float *q = reinterpret_cast<const float *>(b);

  float sum = 0.0f;
  for (size_t i = 0; i < dim; ++i) {
    sum += m[i] * q[i];
  }

  *distance = -sum;
}

// Batch version of inner_product_fp32_distance: distance from `query` to
// each of the `n` vectors.
void inner_product_fp32_batch_distance(const void *const *vectors,
                                       const void *query, size_t n, size_t dim,
                                       float *distances) {
  for (size_t i = 0; i < n; ++i) {
    inner_product_fp32_distance(vectors[i], query, dim, &distances[i]);
  }
}

}  // namespace zvec::turbo::scalar
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/float32/squared_euclidean.cc b/src/turbo/scalar/float32/squared_euclidean.cc new file mode 100644 index 000000000..a3ffd10bb --- /dev/null +++ b/src/turbo/scalar/float32/squared_euclidean.cc @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float32/squared_euclidean.h" +#include + +namespace zvec::turbo::scalar { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { + const float *m = reinterpret_cast(a); + const float *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += zvec::ailego::MathHelper::SquaredDifference(m[i], q[i]); + } + + *distance = sum; +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/squared_euclidean.h b/src/turbo/scalar/float32/squared_euclidean.h new file mode 100644 index 000000000..bf319c1d2 --- /dev/null +++ b/src/turbo/scalar/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/half_float/cosine.cc b/src/turbo/scalar/half_float/cosine.cc new file mode 100644 index 000000000..3c7a39550 --- /dev/null +++ b/src/turbo/scalar/half_float/cosine.cc @@ -0,0 +1,38 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/half_float/cosine.h" +#include "scalar/half_float/inner_product.h" + +namespace zvec::turbo::scalar { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/cosine.h b/src/turbo/scalar/half_float/cosine.h new file mode 100644 index 000000000..cb82bc893 --- /dev/null +++ b/src/turbo/scalar/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP16 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp16_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/inner_product.cc b/src/turbo/scalar/half_float/inner_product.cc new file mode 100644 index 000000000..d06c45b25 --- /dev/null +++ b/src/turbo/scalar/half_float/inner_product.cc @@ -0,0 +1,46 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/half_float/inner_product.h" +#include + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + const zvec::ailego::Float16 *m = + reinterpret_cast(a); + const zvec::ailego::Float16 *q = + reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += static_cast(m[i] * q[i]); + } + + *distance = -sum; +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/inner_product.h b/src/turbo/scalar/half_float/inner_product.h new file mode 100644 index 000000000..98fc4cba4 --- /dev/null +++ b/src/turbo/scalar/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized FP16 +// vector pair. 
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/half_float/squared_euclidean.cc b/src/turbo/scalar/half_float/squared_euclidean.cc new file mode 100644 index 000000000..c3f6b3c2e --- /dev/null +++ b/src/turbo/scalar/half_float/squared_euclidean.cc @@ -0,0 +1,43 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/half_float/squared_euclidean.h" +#include + +namespace zvec::turbo::scalar { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + const zvec::ailego::Float16 *m = + reinterpret_cast(a); + const zvec::ailego::Float16 *q = + reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += zvec::ailego::MathHelper::SquaredDifference(m[i], q[i]); + } + + *distance = sum; +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/squared_euclidean.h b/src/turbo/scalar/half_float/squared_euclidean.h new file mode 100644 index 000000000..8865cd1c2 --- /dev/null +++ b/src/turbo/scalar/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized FP16 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h new file mode 100644 index 000000000..f4b74d7d3 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/common.h @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace zvec::turbo::scalar::internal { + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +static __attribute__((always_inline)) void inner_product_int4_scalar( + const void *a, const void *b, size_t dim, float *distance) { + const uint8_t *m = reinterpret_cast(a); + const uint8_t *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + uint8_t m_val = m[i]; + uint8_t q_val = q[i]; + sum += Int4MulTable[((m_val << 4) & 0xf0) | ((q_val >> 0) & 0xf)] + + Int4MulTable[((m_val >> 0) & 0xf0) | ((q_val >> 4) & 0xf)]; + } + + *distance = sum; +} + +} // namespace zvec::turbo::scalar::internal \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/cosine.cc b/src/turbo/scalar/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..cab09202d --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/cosine.cc @@ -0,0 +1,55 @@ +// Copyright 
2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int4/cosine.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int d = dim - 40; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_int4_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/cosine.h b/src/turbo/scalar/record_quantized_int4/cosine.h new file mode 100644 index 000000000..25838aa02 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec 
project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.cc b/src/turbo/scalar/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..02bdec849 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/inner_product.cc @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int4/inner_product.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized int4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); +} + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_int4_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.h b/src/turbo/scalar/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..b34d47aa4 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized int4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..555f96246 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int4/squared_euclidean.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); +} + +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_int4_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.h b/src/turbo/scalar/record_quantized_int4/squared_euclidean.h new file mode 100644 index 000000000..ea37cfdec --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int8/common.h b/src/turbo/scalar/record_quantized_int8/common.h new file mode 100644 index 000000000..d0b7186ae --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/common.h @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar::internal { + +static inline __attribute__((always_inline)) void inner_product_int8_scalar( + const void *a, const void *b, size_t dim, float *distance) { + const int8_t *m = reinterpret_cast<const int8_t *>(a); + const int8_t *q = reinterpret_cast<const int8_t *>(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += static_cast<float>(m[i] * q[i]); + } + + *distance = -sum; +} + +} // namespace zvec::turbo::scalar::internal diff --git a/src/turbo/scalar/record_quantized_int8/cosine.cc b/src/turbo/scalar/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..fe5faf8e7 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/cosine.cc @@ -0,0 +1,56 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int8/cosine.h" +#include +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int original_dim = dim - 24; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + *distance = -*distance; + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_int8_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/cosine.h b/src/turbo/scalar/record_quantized_int8/cosine.h new file mode 100644 index 000000000..e06d8b234 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int8 vector pair. +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.cc b/src/turbo/scalar/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..e33cdac12 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/inner_product.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int8/inner_product.h" +#include +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized int8 +// vector pair. 
+void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + + *distance = -1 * *distance; + + const float *a_tail = reinterpret_cast<const float *>( + reinterpret_cast<const uint8_t *>(a) + original_dim); + const float *b_tail = reinterpret_cast<const float *>( + reinterpret_cast<const uint8_t *>(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_int8_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.h b/src/turbo/scalar/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..1ed51489a --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..d05d1a049 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int8/squared_euclidean.h" +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + *distance = -*distance; + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_int8_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.h b/src/turbo/scalar/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..07db60519 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/sse/record_quantized_int4/common.h b/src/turbo/sse/record_quantized_int4/common.h new file mode 100644 index 000000000..623d6365a --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/common.h @@ -0,0 +1,174 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__SSE4_1__) +#include +#include +#include +#include + +namespace zvec::turbo::sse::internal { + +//! 
Four-bits Convert Table +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +/*! Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} + +static __attribute__((always_inline)) void inner_product_int4_sse( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i 
*)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +} // namespace zvec::turbo::sse::internal + +#endif // defined(__SSE4_1__) diff --git a/src/turbo/sse/record_quantized_int4/cosine.cc b/src/turbo/sse/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..5751e511d --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/cosine.cc @@ -0,0 +1,70 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int4/cosine.h" +#include "sse/record_quantized_int4/common.h" +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE4_1__ +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__SSE4_1__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE4_1__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/cosine.h b/src/turbo/sse/record_quantized_int4/cosine.h new file mode 100644 index 000000000..87306a06e --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 
2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/inner_product.cc b/src/turbo/sse/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..47121a668 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/inner_product.cc @@ -0,0 +1,76 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "sse/record_quantized_int4/inner_product.h" +#include "sse/record_quantized_int4/common.h" + +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +// Compute squared inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__SSE4_1__ +} + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__SSE4_1__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE4_1__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/inner_product.h b/src/turbo/sse/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..4ee508ed2 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..59155e2f3 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,78 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "sse/record_quantized_int4/squared_euclidean.h" +#include "sse/record_quantized_int4/common.h" + +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE4_1__ +} + +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__SSE4_1__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE4_1__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.h b/src/turbo/sse/record_quantized_int4/squared_euclidean.h new file mode 100644 index 000000000..3cff9f99b --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute squared euclidean distance between a single quantized INT4 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int8/common.h b/src/turbo/sse/record_quantized_int8/common.h new file mode 100644 index 000000000..b48b2598e --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/common.h @@ -0,0 +1,210 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__SSE__) +#include +#include +#include +#include + +namespace zvec::turbo::sse::internal { + +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static __attribute__((always_inline)) void inner_product_int8_sse( + const void *a, const void *b, size_t size, float *distance) { + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 5) << 5); + + __m128i xmm_sum_0 = _mm_setzero_si128(); + __m128i xmm_sum_1 = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m128i xmm_lhs_0 = _mm_load_si128((const __m128i *)(lhs + 0)); + __m128i xmm_lhs_1 = _mm_load_si128((const __m128i *)(lhs + 16)); + __m128i xmm_rhs_0 = _mm_load_si128((const __m128i *)(rhs + 0)); + __m128i xmm_rhs_1 = _mm_load_si128((const __m128i *)(rhs + 16)); + + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); + xmm_sum_0 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), + ONES_INT16_SSE), + xmm_sum_0); + xmm_sum_1 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), + ONES_INT16_SSE), + xmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m128i xmm_lhs = 
_mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + xmm_sum_0 = _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), ONES_INT16_SSE), + xmm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m128i xmm_lhs_0 = _mm_loadu_si128((const __m128i *)(lhs + 0)); + __m128i xmm_lhs_1 = _mm_loadu_si128((const __m128i *)(lhs + 16)); + __m128i xmm_rhs_0 = _mm_loadu_si128((const __m128i *)(rhs + 0)); + __m128i xmm_rhs_1 = _mm_loadu_si128((const __m128i *)(rhs + 16)); + + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); + xmm_sum_0 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), + ONES_INT16_SSE), + xmm_sum_0); + xmm_sum_1 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), + ONES_INT16_SSE), + xmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + xmm_sum_0 = _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), ONES_INT16_SSE), + xmm_sum_0); + lhs += 16; + rhs += 16; + } + } + float result = static_cast( + HorizontalAdd_INT32_V128(_mm_add_epi32(xmm_sum_0, xmm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], 
result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +template +__attribute__((always_inline)) void inner_product_int8_batch_sse_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + // TBD +} + +static __attribute__((always_inline)) void inner_product_int8_batch_sse( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int8_batch_sse_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int8_batch_sse_impl<1>(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::sse::internal + +#endif // defined(__SSE__) diff --git a/src/turbo/sse/record_quantized_int8/cosine.cc b/src/turbo/sse/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..879cf9c99 --- /dev/null +++ 
b/src/turbo/sse/record_quantized_int8/cosine.cc @@ -0,0 +1,70 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int8/cosine.h" +#include "sse/record_quantized_int8/common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE__ +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/cosine.h 
b/src/turbo/sse/record_quantized_int8/cosine.h new file mode 100644 index 000000000..e0ac7f556 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/cosine.h @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT8 vector pair. +// `dim` includes the original vector bytes plus a 24-byte metadata tail +// (3 floats: scale_a, bias_a, sum_a). +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +// The query must have been preprocessed by cosine_int8_query_preprocess +// (int8 -> uint8 via +128 shift) before calling this function. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.cc b/src/turbo/sse/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..6b6c4d9c1 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/inner_product.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int8/inner_product.h" +#include "sse/record_quantized_int8/common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +// Compute squared Euclidean distance between a single quantized INT4 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + const size_t original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__SSE__ +} + +// Batch version of inner_product_int8_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.h b/src/turbo/sse/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..9c6314b35 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..3fb001204 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "sse/record_quantized_int8/squared_euclidean.h" +#include "sse/record_quantized_int8/common.h" +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif +} + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.h b/src/turbo/sse/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..1e2cf45b4 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute squared Euclidean distance between a single quantized INT8 +// vector pair. +// `dim` includes the original vector bytes plus a 20-byte metadata tail +// (4 floats: scale_a, bias_a, sum_a, sum2_a). +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared_euclidean_int8_distance. +// The query must have been preprocessed by +// squared_euclidean_int8_query_preprocess (int8 -> uint8 via +128 shift) +// before calling this function. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +// Preprocess the query vector in-place (shift int8 -> uint8 by adding 128) +// for the batch path. Only the original_dim bytes are shifted; the metadata +// tail is left intact. `dim` includes the 20-byte metadata tail. 
+void squared_euclidean_int8_query_preprocess(void *query, size_t dim); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index a731cfed1..1fb5dcd7e 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -14,34 +14,344 @@ #include #include +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/squared_euclidean.h" +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/squared_euclidean.h" +#include "avx/float32/cosine.h" +#include "avx/float32/inner_product.h" +#include "avx/float32/squared_euclidean.h" +#include "avx/half_float/cosine.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/squared_euclidean.h" +#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/inner_product.h" +#include "avx2/record_quantized_int4/squared_euclidean.h" +#include "avx2/record_quantized_int8/cosine.h" +#include "avx2/record_quantized_int8/inner_product.h" +#include "avx2/record_quantized_int8/squared_euclidean.h" +#include "avx512/float32/cosine.h" +#include "avx512/float32/inner_product.h" +#include "avx512/float32/squared_euclidean.h" +#include "avx512/half_float/cosine.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/squared_euclidean.h" +#include "avx512_fp16/half_float/cosine.h" +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" +#include "avx512_vnni/record_quantized_int8/inner_product.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "scalar/float32/cosine.h" +#include "scalar/float32/inner_product.h" +#include "scalar/float32/squared_euclidean.h" +#include "scalar/half_float/cosine.h" +#include "scalar/half_float/inner_product.h" +#include "scalar/half_float/squared_euclidean.h" +#include 
"scalar/record_quantized_int4/cosine.h" +#include "scalar/record_quantized_int4/inner_product.h" +#include "scalar/record_quantized_int4/squared_euclidean.h" +#include "scalar/record_quantized_int8/cosine.h" +#include "scalar/record_quantized_int8/inner_product.h" +#include "scalar/record_quantized_int8/squared_euclidean.h" +#include "sse/record_quantized_int4/cosine.h" +#include "sse/record_quantized_int4/inner_product.h" +#include "sse/record_quantized_int4/squared_euclidean.h" +#include "sse/record_quantized_int8/cosine.h" +#include "sse/record_quantized_int8/inner_product.h" +#include "sse/record_quantized_int8/squared_euclidean.h" namespace zvec::turbo { DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { +#if defined(__ARM_NEON) + // INT8 if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int8_distance; + } + + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int8_distance; + } + } + } + + // INT$ + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } + + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } + } + } + + // FP32 + if (data_type == DataType::kFp32) { + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return armv8::squared_euclidean_fp32_distance; + } + + if (metric_type == MetricType::kCosine) { + return 
armv8::cosine_fp32_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return armv8::inner_product_fp32_distance; + } + } + } + + // FP16 + if (data_type == DataType::kFp16) { + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return armv8::squared_euclidean_fp16_distance; + } + + if (metric_type == MetricType::kCosine) { + return armv8::cosine_fp16_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return armv8::inner_product_fp16_distance; + } + } + } +#else + // INT8 + if (data_type == DataType::kInt8) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_distance; } if (metric_type == MetricType::kCosine) { return avx512_vnni::cosine_int8_distance; } + + if (metric_type == MetricType::kInnerProduct) { + return avx512_vnni::inner_product_int8_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int8_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return sse::inner_product_int8_distance; + } + 
} + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int8_distance; } } } + + // INT4 + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return sse::inner_product_int4_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } else if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } else if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } + } + } + + // FP32 + if (data_type == DataType::kFp32) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { 
+ return avx512::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx512::inner_product_fp32_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return avx::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx::inner_product_fp32_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_fp32_distance; + } + } + } + + // FP16 + if (data_type == DataType::kFp16) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_FP16 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512FP16)) { + if (metric_type == MetricType::kInnerProduct) { + return avx512_fp16::inner_product_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx512_fp16::cosine_fp16_distance; + } + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512_fp16::squared_euclidean_fp16_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512::squared_euclidean_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx512::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx512::inner_product_fp16_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX && + 
(cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx::squared_euclidean_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx::inner_product_fp16_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_fp16_distance; + } + } + } +#endif + return nullptr; } BatchDistanceFunc get_batch_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_batch_distance; } @@ -51,15 +361,37 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type, } } } + + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int4_batch_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_batch_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_batch_distance; + } + } + } + } + return nullptr; } 
QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_query_preprocess; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7be2294dd..e3b54ee24 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,6 +7,7 @@ include_directories(${PROJECT_ROOT_DIR}) cc_directories(ailego) cc_directories(db) cc_directories(core) +cc_directories(turbo) if(BUILD_C_BINDINGS) cc_directories(c) endif() diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_test.cc index 3f27f5252..1ee7ef6d1 100644 --- a/tests/core/algorithm/hnsw/hnsw_streamer_test.cc +++ b/tests/core/algorithm/hnsw/hnsw_streamer_test.cc @@ -3471,93 +3471,6 @@ TEST_F(HnswStreamerTest, TestGroupInBruteforceSearch) { } } -#if 0 -TEST_F(HnswStreamerTest, TestBinaryConverter) { - uint32_t dimension = 2560; - - IndexStreamer::Pointer streamer = - IndexFactory::CreateStreamer("HnswStreamer"); - ASSERT_TRUE(streamer != nullptr); - - ailego::Params params; - // params.set(PARAM_HNSW_STREAMER_MAX_NEIGHBOR_COUNT, 10); - // params.set(PARAM_HNSW_STREAMER_SCALING_FACTOR, 16); - // params.set(PARAM_HNSW_STREAMER_EFCONSTRUCTION, 10); - // params.set(PARAM_HNSW_STREAMER_EF, 5); - params.set(PARAM_HNSW_STREAMER_BRUTE_FORCE_THRESHOLD, 1000U); - - ailego::Params stg_params; - - IndexMeta index_meta_raw(IndexMeta::DataType::DT_FP32, dimension); - index_meta_raw.set_metric("InnerProduct", 0, ailego::Params()); - - ailego::Params 
converter_params; - auto converter = IndexFactory::CreateConverter("BinaryConverter"); - ASSERT_TRUE(converter != nullptr); - - converter->init(index_meta_raw, converter_params); - - IndexMeta index_meta = converter->meta(); - - auto reformer = IndexFactory::CreateReformer(index_meta.reformer_name()); - ASSERT_TRUE(reformer != nullptr); - - ASSERT_EQ(0, reformer->init(index_meta.reformer_params())); - - auto storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, storage->init(stg_params)); - ASSERT_EQ(0, storage->open(dir_ + "TestBinaryConverter.index", true)); - ASSERT_EQ(0, streamer->init(index_meta, params)); - ASSERT_EQ(0, streamer->open(storage)); - - size_t cnt = 5000U; - auto ctx = streamer->create_context(); - ASSERT_TRUE(!!ctx); - - IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dimension); - - std::random_device rd; - std::mt19937 gen(rd()); - - std::uniform_real_distribution dist(-2.0, 2.0); - std::vector> vecs; - - for (size_t i = 0; i < cnt; i++) { - NumericalVector vec(dimension); - for (size_t j = 0; j < dimension; ++j) { - vec[j] = dist(gen); - } - - std::string new_vec; - IndexQueryMeta new_meta; - - ASSERT_EQ(0, reformer->convert(vec.data(), qmeta, &new_vec, &new_meta)); - ASSERT_EQ(0, streamer->add_impl(i, new_vec.data(), new_meta, ctx)); - - vecs.push_back(vec); - } - - size_t query_cnt = 200U; - auto knnCtx = streamer->create_context(); - - float epison = 1e-6; - for (size_t i = 0; i < query_cnt; i++) { - auto &vec = vecs[i]; - std::string new_query; - IndexQueryMeta new_meta; - ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &new_query, &new_meta)); - - size_t topk = 50; - knnCtx->set_topk(topk); - ASSERT_EQ(0, streamer->search_impl(new_query.data(), new_meta, knnCtx)); - auto &results = knnCtx->result(); - ASSERT_EQ(topk, results.size()); - ASSERT_EQ(i, results[0].key()); - ASSERT_NEAR(0, results[0].score(), epison); - } -} -#endif - TEST_F(HnswStreamerTest, TestAddAndSearchWithID) { IndexStreamer::Pointer streamer = 
IndexFactory::CreateStreamer("HnswStreamer"); @@ -3671,131 +3584,134 @@ TEST_F(HnswStreamerTest, TestAddAndSearchWithID) { // EXPECT_GT(cost, 2.0f); } -#if 0 -TEST_F(HnswStreamerTest, TestBasicRefiner) { - uint32_t dimension = 1120; - - IndexStreamer::Pointer base_streamer = +TEST_F(HnswStreamerTest, TestTurboCosineInt8Quantizer) { + IndexStreamer::Pointer streamer = IndexFactory::CreateStreamer("HnswStreamer"); - ASSERT_TRUE(base_streamer != nullptr); + ASSERT_TRUE(streamer != nullptr); - IndexStreamer::Pointer refine_streamer = - IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(refine_streamer != nullptr); + ailego::Params params; + params.set(PARAM_HNSW_STREAMER_MAX_NEIGHBOR_COUNT, 50); + params.set(PARAM_HNSW_STREAMER_SCALING_FACTOR, 16); + params.set(PARAM_HNSW_STREAMER_EFCONSTRUCTION, 100); + params.set(PARAM_HNSW_STREAMER_EF, 100); + params.set(PARAM_HNSW_STREAMER_BRUTE_FORCE_THRESHOLD, 1000U); + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); - IndexRefiner::Pointer refiner = IndexFactory::CreateRefiner("BasicRefiner"); - ASSERT_TRUE(refiner != nullptr); + ailego::Params stg_params; - ailego::Params params; - IndexMeta index_meta(IndexMeta::DataType::DT_FP32, dimension); - index_meta.set_metric("InnerProduct", 0, ailego::Params()); + IndexMeta index_meta_raw(IndexMeta::DataType::DT_FP32, dim); + index_meta_raw.set_metric("Cosine", 0, ailego::Params()); ailego::Params converter_params; - auto converter = IndexFactory::CreateConverter("BinaryConverter"); - ASSERT_TRUE(converter != nullptr); + auto quantizer = IndexFactory::CreateQuantier("Int8Quantizer"); + ASSERT_TRUE(quantizer != nullptr); - converter->init(index_meta, converter_params); + quantizer->init(index_meta_raw, quantizer_params); - IndexMeta index_meta_binary = converter->meta(); + IndexMeta index_meta = quantizer->meta(); - auto reformer = - IndexFactory::CreateReformer(index_meta_binary.reformer_name()); - ASSERT_TRUE(reformer != nullptr); + auto storage = 
IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, + storage->open(dir_ + "TestTurboCosineInt8Quantizer.index", true)); + ASSERT_EQ(0, streamer->init(index_meta, params)); + ASSERT_EQ(0, streamer->open(storage)); - ASSERT_EQ(0, reformer->init(index_meta_binary.reformer_params())); + NumericalVector vec(dim); + size_t cnt = 2000U; + auto ctx = streamer->create_context(); + ASSERT_TRUE(!!ctx); - // base streamer - ailego::Params base_stg_params; - auto base_storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, base_storage->init(base_stg_params)); - ASSERT_EQ(0, base_storage->open(dir_ + "TestBasicRefinerBase.index", true)); - ASSERT_EQ(0, base_streamer->init(index_meta_binary, params)); - ASSERT_EQ(0, base_streamer->open(base_storage)); + IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dim); + IndexQueryMeta new_meta; - auto base_ctx = base_streamer->create_context(); - ASSERT_TRUE(!!base_ctx); + const float epsilon = 1e-2; + float fixed_value = float(cnt) / 2; + for (size_t i = 0; i < cnt; i++) { + float add_on = i * 10; + for (size_t j = 0; j < dim; ++j) { + if (j < dim / 4) + vec[j] = fixed_value; + else + vec[j] = fixed_value + add_on; + } - // refine streamer - ailego::Params refine_stg_params; - auto refine_storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, refine_storage->init(refine_stg_params)); - ASSERT_EQ(0, - refine_storage->open(dir_ + "TestBasicRefinerRefine.index", true)); - ASSERT_EQ(0, refine_streamer->init(index_meta, params)); - ASSERT_EQ(0, refine_streamer->open(refine_storage)); - auto refine_ctx = refine_streamer->create_context(); - ASSERT_TRUE(!!refine_ctx); + std::string new_vec; - ailego::Params refiner_params; - ASSERT_EQ(0, refiner->init(base_streamer, refine_streamer, refiner_params)); + ASSERT_EQ(0, quantizer->convert(vec.data(), qmeta, &new_vec, &new_meta)); + ASSERT_EQ(0, streamer->add_impl(i, new_vec.data(), new_meta, ctx)); + } - auto 
ctx = refiner->create_context(); - ASSERT_TRUE(!!ctx); + for (size_t i = 0; i < cnt; i++) { + float add_on = i * 10; - IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dimension); + const void *vector = streamer->get_vector(i); + ASSERT_NE(vector, nullptr); - std::random_device rd; - std::mt19937 gen(rd()); + std::string denormalized_vec; + denormalized_vec.resize(dim * sizeof(float)); + quantizer->revert(vector, new_meta, &denormalized_vec); - std::uniform_real_distribution dist(-2.0, 2.0); - std::vector> vecs; + float vector_value = *((float *)(denormalized_vec.data()) + dim - 1); + EXPECT_NEAR(vector_value, fixed_value + add_on, epsilon); + } - size_t cnt = 5000U; - for (size_t i = 0; i < cnt; i++) { - NumericalVector vec(dimension); - for (size_t j = 0; j < dimension; ++j) { - vec[j] = dist(gen); + auto linearCtx = streamer->create_context(); + linearCtx->set_fetch_vector(true); + auto knnCtx = streamer->create_context(); + knnCtx->set_fetch_vector(true); + + size_t query_cnt = 200U; + size_t topk = 200; + linearCtx->set_topk(topk); + knnCtx->set_topk(topk); + uint64_t knnTotalTime = 0; + uint64_t linearTotalTime = 0; + for (size_t i = 0; i < query_cnt; i++) { + float add_on = i * 10; + for (size_t j = 0; j < dim; ++j) { + if (j < dim / 4) + vec[j] = fixed_value; + else + vec[j] = fixed_value + add_on; } - std::string binary_vec; - IndexQueryMeta binary_qmeta; + std::string new_query; + IndexQueryMeta new_meta; + ASSERT_EQ(0, quantizer->quantize(vec.data(), qmeta, &new_query, &new_meta)); + auto t1 = ailego::Realtime::MicroSeconds(); + ASSERT_EQ(0, streamer->search_impl(new_query.data(), new_meta, knnCtx)); + auto t2 = ailego::Realtime::MicroSeconds(); ASSERT_EQ(0, - reformer->convert(vec.data(), qmeta, &binary_vec, &binary_qmeta)); - ASSERT_EQ(0, refiner->add_impl(i, binary_vec.data(), binary_qmeta, - vec.data(), qmeta, ctx)); - - vecs.push_back(vec); - } + streamer->search_bf_impl(new_query.data(), new_meta, linearCtx)); + auto t3 = 
ailego::Realtime::MicroSeconds(); - size_t query_cnt = 200U; - // size_t query_cnt = 1U; + knnTotalTime += t2 - t1; + linearTotalTime += t3 - t2; - auto searcherCtx = refiner->create_context(); + auto &knnResult = knnCtx->result(); + ASSERT_EQ(topk, knnResult.size()); - for (size_t i = 0; i < query_cnt; i++) { - auto &vec = vecs[i]; + auto &linearResult = linearCtx->result(); + ASSERT_EQ(topk, linearResult.size()); + ASSERT_EQ(i, linearResult[0].key()); - // float abs_value{0}; - // for (size_t j = 0; j < dimension; ++j) { - // std::cout << "dim: " << j << ", value: " << vec[j] << std::endl; + ASSERT_NE(knnResult[0].vector(), nullptr); + ASSERT_NE(linearResult[0].vector(), nullptr); - // abs_value += std::abs(vec[j]); - // } - // std::cout << "abs value: " << abs_value << std::endl; + std::string denormalized_vec; + denormalized_vec.resize(dim * sizeof(float)); + quantizer->dequantize(linearResult[0].vector(), new_meta, + &denormalized_vec); - std::string new_query; - IndexQueryMeta binary_qmeta; - ASSERT_EQ( - 0, reformer->transform(vec.data(), qmeta, &new_query, &binary_qmeta)); - - size_t topk = 50; - searcherCtx->set_topk(topk); - ASSERT_EQ(0, refiner->search_impl(new_query.data(), binary_qmeta, - vec.data(), qmeta, searcherCtx)); - auto &results = searcherCtx->result(); - ASSERT_EQ(topk, results.size()); - ASSERT_EQ(i, results[0].key()); - - // for (size_t i = 0; i < results.size(); ++i) { - // std::cout << i << ", id: " << results[i].index() - // << ", score: " << results[i].score() << std::endl; - // } + float vector_value = *(((float *)(denormalized_vec.data()) + dim - 1)); + EXPECT_NEAR(vector_value, fixed_value + add_on, epsilon); } -} - -#endif + std::cout << "knnTotalTime: " << knnTotalTime << std::endl; + std::cout << "linearTotalTime: " << linearTotalTime << std::endl; +} } // namespace core } // namespace zvec diff --git a/tests/core/metric/quantized_integer_metric_test.cc b/tests/core/metric/quantized_integer_metric_test.cc index 
835a07fb7..f56d6ef67 100644 --- a/tests/core/metric/quantized_integer_metric_test.cc +++ b/tests/core/metric/quantized_integer_metric_test.cc @@ -32,8 +32,7 @@ using namespace zvec::ailego; static IndexHolder::Pointer GetHolder( size_t dim, size_t count, std::uniform_real_distribution &dist) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); auto holder = std::make_shared>(dim); for (size_t i = 0; i < count; ++i) { ailego::NumericalVector vec(dim); @@ -71,8 +70,7 @@ TEST(QuantizedIntegerMetric, General) { Params params; - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 1.0); const size_t DIMENSION = 21; ailego::NumericalVector x(DIMENSION); @@ -141,8 +139,7 @@ TEST(QuantizedIntegerMetric, General) { } TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -202,8 +199,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { } TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanReformer) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); std::uniform_int_distribution dist2(0, 1); @@ -344,8 +340,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -404,8 +399,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { } TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanReformer) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); 
std::uniform_int_distribution dist2(0, 1); @@ -546,8 +540,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -631,8 +624,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProductMetric) { } TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -716,8 +708,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProductMetric) { } TEST(QuantizedIntegerMetric, TestInt8MipsSquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -805,8 +796,7 @@ TEST(QuantizedIntegerMetric, TestInt8MipsSquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt4MipsSquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -890,8 +880,7 @@ TEST(QuantizedIntegerMetric, TestInt4MipsSquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt8NormalizedCosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -990,8 +979,7 @@ TEST(QuantizedIntegerMetric, TestInt8NormalizedCosineMetric) { } TEST(QuantizedIntegerMetric, TestInt8Cosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const 
size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -1071,8 +1059,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { } TEST(QuantizedIntegerMetric, TestInt4NormalizedCosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; diff --git a/tests/turbo/CMakeLists.txt b/tests/turbo/CMakeLists.txt new file mode 100644 index 000000000..0e864858a --- /dev/null +++ b/tests/turbo/CMakeLists.txt @@ -0,0 +1,14 @@ +include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) + +file(GLOB_RECURSE ALL_TEST_SRCS *_test.cc) + +foreach(CC_SRCS ${ALL_TEST_SRCS}) + get_filename_component(CC_TARGET ${CC_SRCS} NAME_WE) + cc_gtest( + NAME ${CC_TARGET} + STRICT + LIBS zvec_ailego core_framework core_metric core_quantizer + SRCS ${CC_SRCS} + INCS . ${PROJECT_ROOT_DIR}/src/core/ + ) +endforeach() \ No newline at end of file diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc new file mode 100644 index 000000000..ece33613d --- /dev/null +++ b/tests/turbo/turbo_cosine_test.cc @@ -0,0 +1,366 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx, avx512, scalar +TEST(CosineMetric, TestFp32Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + 
ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(CosineMetric, TestFp16Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, 
DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.2; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, scalar +TEST(CosineMetric, TestFp32CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + 
turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 
0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(CosineMetric, TestFp16CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + 
std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc new file mode 100644 index 000000000..8388489f4 --- /dev/null +++ b/tests/turbo/turbo_euclidean_test.cc @@ -0,0 +1,316 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx, avx512, scalar +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + 
float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector 
doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.2; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, scalar +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + std::vector> doc_vecs; + for 
(size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + doc_vecs.push_back(doc_vec); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, 
turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + 
qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc new file mode 100644 index 000000000..14fc2cfc0 --- /dev/null +++ b/tests/turbo/turbo_inner_product_test.cc @@ -0,0 +1,317 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx, avx512, scalar +TEST(InnerProductMetric, TestFp32InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(InnerProductMetric, TestFp16InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = 
IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), 
&score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.2; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, scalar +TEST(InnerProductMetric, TestFp32InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + std::vector> doc_vecs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + 
batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(InnerProductMetric, TestFp16InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for 
(size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git 
a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc new file mode 100644 index 000000000..3394a27a0 --- /dev/null +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -0,0 +1,1300 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx512vnni = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, 
turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx512vnni{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx512vnni(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512vnni); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, 
score_avx512vnni, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, 
reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = 
turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 
0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), 
convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int8 converter + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + 
ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx512vnni = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float 
score_avx512vnni{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + func_float32(fp32_query_out.data(), fp32_doc_out.data(), + fp32_qmeta_reformer.dimension(), &score_float32); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx512vnni(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512vnni); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx512vnni, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + 
IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int4 converter + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float 
score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + func_float32(fp32_query_out.data(), fp32_doc_out.data(), + fp32_qmeta_reformer.dimension(), &score_float32); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + 
turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx512vnni = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx512vnni(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 
0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx512vnni(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx512vnni[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx512vnni[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = 
IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions 
+ std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, 
turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), 
BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( 
+ turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); 
+ + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int8 converter + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx512vnni = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + 
turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + std::vector fp32_doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + fp32_doc_outs.push_back(fp32_doc_out); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + 
doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector score_float32(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx512vnni(BATCH_SIZE, 0.0f); + std::vector score_avx2(BATCH_SIZE, 0.0f); + std::vector score_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector fp32_doc_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + fp32_doc_ptrs[k] = fp32_doc_outs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(fp32_doc_ptrs.data(), fp32_query_out.data(), + BATCH_SIZE, fp32_qmeta_reformer.dimension(), + &score_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_scalar[0]); + + batch_func_avx512vnni(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx512vnni[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(score_float32[j], score_avx512vnni[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar[j], score_avx2[j], 0.001); + ASSERT_NEAR(score_scalar[j], score_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + fp32_doc_outs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + IndexMeta 
meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int4 converter + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + 
+ IndexQueryMeta qmeta_reformer; + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + std::vector fp32_doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + fp32_doc_outs.push_back(fp32_doc_out); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector score_float32(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx2(BATCH_SIZE, 0.0f); + std::vector score_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector fp32_doc_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + fp32_doc_ptrs[k] = fp32_doc_outs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(fp32_doc_ptrs.data(), fp32_query_out.data(), + BATCH_SIZE, fp32_qmeta_reformer.dimension(), + &score_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(score_float32[j], 
score_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar[j], score_avx2[j], 0.001); + ASSERT_NEAR(score_scalar[j], score_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + fp32_doc_outs.clear(); + } + } +} \ No newline at end of file