diff --git a/src/core/framework/index_meta.cc b/src/core/framework/index_meta.cc index 11d54cb63..d0eadb02d 100644 --- a/src/core/framework/index_meta.cc +++ b/src/core/framework/index_meta.cc @@ -30,7 +30,8 @@ struct IndexMetaFormatHeader { uint32_t space_id; uint32_t attachment_offset; uint32_t attachment_size; - uint8_t reserved_[4092]; + uint32_t extra_meta_size; + uint8_t reserved_[4088]; }; static_assert(sizeof(IndexMetaFormatHeader) % 32 == 0, @@ -47,6 +48,7 @@ void IndexMeta::serialize(std::string *out) const { format.dimension = dimension_; format.unit_size = unit_size_; format.space_id = space_id_; + format.extra_meta_size = extra_meta_size_; if (!metric_name_.empty()) { ailego::Params item; diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index e4db83146..bbb2e587d 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -96,24 +96,44 @@ class QuantizedIntegerMetric : public IndexMetric { switch (origin_metric_type_) { case MetricType::kSquaredEuclidean: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kSquaredEuclidean, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret && m == 1 && n == 1) { return turbo_ret; } return DistanceMatrixCompute(m, n); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kSquaredEuclidean, + turbo::DataType::kInt4, quantize_type_); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } + return DistanceMatrixCompute(m, n); } break; case MetricType::kInnerProduct: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kInnerProduct, + turbo::DataType::kInt8, 
quantize_type_); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } + if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kInnerProduct, + turbo::DataType::kInt4, quantize_type_); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } break; @@ -137,9 +157,9 @@ class QuantizedIntegerMetric : public IndexMetric { break; case MetricType::kCosine: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kCosine, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kCosine, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret) { return turbo_ret; } @@ -160,7 +180,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { auto turbo_ret = turbo::get_batch_distance_func( turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + quantize_type_); if (turbo_ret) { return turbo_ret; } @@ -215,7 +235,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { auto turbo_ret = turbo::get_batch_distance_func( turbo::MetricType::kCosine, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + quantize_type_); if (turbo_ret) { return turbo_ret; } diff --git a/src/include/zvec/core/framework/index_meta.h b/src/include/zvec/core/framework/index_meta.h index 3a09aaefb..a11af00f4 100644 --- a/src/include/zvec/core/framework/index_meta.h +++ b/src/include/zvec/core/framework/index_meta.h @@ -40,6 +40,7 @@ class IndexMeta { DT_BINARY64 = 8, }; + /*! 
Major Orders */ enum MajorOrder { @@ -586,6 +587,7 @@ class IndexMeta { uint32_t dimension_{0}; uint32_t unit_size_{0}; uint32_t element_size_{0}; + uint32_t extra_meta_size_{0}; uint64_t space_id_{0}; uint32_t metric_revision_{0}; uint32_t converter_revision_{0}; @@ -708,6 +710,8 @@ class IndexQueryMeta { uint32_t dimension_{0}; uint32_t unit_size_{0}; uint32_t element_size_{0}; + uint32_t extra_meta_size_{0}; + uint32_t quantize_type_{0}; }; } // namespace core diff --git a/src/include/zvec/core/framework/index_metric.h b/src/include/zvec/core/framework/index_metric.h index 24d772362..eeb54099f 100644 --- a/src/include/zvec/core/framework/index_metric.h +++ b/src/include/zvec/core/framework/index_metric.h @@ -137,6 +137,9 @@ struct IndexMetric : public IndexModule { virtual DistanceBatchQueryPreprocessFunc get_query_preprocess_func() const { return nullptr; } + + private: + int quantize_type_{0}; }; } // namespace core diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h index 6ecbfdd1e..f07ace8c6 100644 --- a/src/include/zvec/turbo/turbo.h +++ b/src/include/zvec/turbo/turbo.h @@ -28,28 +28,51 @@ using QueryPreprocessFunc = enum class MetricType { kSquaredEuclidean, kCosine, + kInnerProduct, kMipsSquaredEuclidean, kUnknown, }; enum class DataType { + kInt4, kInt8, + kFp16, + kFp32, kUnknown, }; enum class QuantizeType { kDefault, + kRecordInt8, + kRecordInt4, + kInt8, + kInt4, + kFp16, + kPQ, + kRabit +}; + +enum class CpuArchType { + kAuto, + kScalar, + kSSE, + kAVX, + kAVX2, + kAVX512, + kAVX512VNNI, + kAVX512FP16 }; DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type); + QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); -BatchDistanceFunc get_batch_distance_func(MetricType metric_type, - DataType data_type, - QuantizeType quantize_type); +BatchDistanceFunc get_batch_distance_func( + MetricType metric_type, DataType data_type, QuantizeType quantize_type, + 
CpuArchType cpu_arch_type = CpuArchType::kAuto); -QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, - DataType data_type, - QuantizeType quantize_type); +QueryPreprocessFunc get_query_preprocess_func( + MetricType metric_type, DataType data_type, QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); } // namespace zvec::turbo diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 9cbb2fac7..e51f72b1a 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -13,18 +13,71 @@ endif() file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) -# Set per-file compile flags for AVX512-VNNI sources. -# set_source_files_properties is directory-scoped, so it must be called in the -# same directory that adds the sources to a target (i.e. here, not in a -# subdirectory). if(NOT ANDROID AND AUTO_DETECT_ARCH) if (HOST_ARCH MATCHES "^(x86|x64)$") - file(GLOB_RECURSE AVX512_VNNI_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc) + file(GLOB_RECURSE AVX512_AVX512FP16_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.c) + set_source_files_properties( + ${AVX512_AVX512FP16_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512FP16}" + ) + + # Set per-file compile flags for AVX512-VNNI sources. + # set_source_files_properties is directory-scoped, so it must be called in the + # same directory that adds the sources to a target (i.e. here, not in a + # subdirectory). 
+ file(GLOB_RECURSE AVX512_VNNI_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.c) set_source_files_properties( ${AVX512_VNNI_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) + + file(GLOB_RECURSE AVX512_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.c) + set_source_files_properties( + ${AVX512_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" + ) + + file(GLOB_RECURSE AVX2_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.c) + set_source_files_properties( + ${AVX2_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX2}" + ) + + file(GLOB_RECURSE SSE_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.c) + set_source_files_properties( + ${SSE_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_SSE}" + ) + elseif (HOST_ARCH MATCHES "^(arm|arm64)$") + set(TURBO_MARCH_FLAG_NEON "-march=armv8-a") + + file(GLOB_RECURSE NEON_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/armv8/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/armv8/*.c + ) + + set_source_files_properties( + ${NEON_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_NEON}" + ) endif() endif() diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc new file mode 100644 index 000000000..7e2b990d7 --- /dev/null +++ b/src/turbo/armv8/float32/cosine.cc @@ -0,0 +1,62 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/inner_product_common.h" + +namespace zvec::turbo::armv8 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + size_t extra_size, float *distance) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + internal::inner_product_fp32_armv8(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__ARM_NEON) + const int original_dim = dim - 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_fp32_batch_armv8(vectors, query, n, original_dim, + distances); + + for (int i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/cosine.h b/src/turbo/armv8/float32/cosine.h new file mode 100644 index 000000000..529e11ef3 --- /dev/null +++ b/src/turbo/armv8/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/inner_product.cc b/src/turbo/armv8/float32/inner_product.cc new file mode 100644 index 000000000..7cfbd7784 --- /dev/null +++ b/src/turbo/armv8/float32/inner_product.cc @@ -0,0 +1,52 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/float32/inner_product.h" +#include "armv8/float32/inner_product_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +// Compute squared Euclidean distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + inner_product_fp32_armv8(a, b, dim, distance); +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__ARM_NEON) + inner_product_fp32_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/inner_product.h b/src/turbo/armv8/float32/inner_product.h new file mode 100644 index 000000000..a1d8b612f --- /dev/null +++ b/src/turbo/armv8/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/float32/inner_product_common.h b/src/turbo/armv8/float32/inner_product_common.h new file mode 100644 index 000000000..26ad45d21 --- /dev/null +++ b/src/turbo/armv8/float32/inner_product_common.h @@ -0,0 +1,142 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_FP32_GENERAL(m, q, sum) sum += (m * q); + +namespace zvec::turbo::armv8::internal { + +static __attribute__((always_inline)) void inner_product_fp32_armv8( + const void *a, const void *b, size_t size, float *distance) { + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + size; + const float *last_aligned = lhs + ((size >> 3) << 3); + + float32x4_t v_sum_0 = vdupq_n_f32(0); + float32x4_t v_sum_1 = vdupq_n_f32(0); + + for (; lhs != last_aligned; lhs += 8, rhs += 8) { + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + v_sum_1 = vfmaq_f32(v_sum_1, vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + } + if (last >= last_aligned + 4) { + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs), vld1q_f32(rhs)); + lhs += 4; + rhs += 4; + } + + float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = -result; +} + +template +static __attribute__((always_inline)) void inner_product_fp32_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < 
batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void inner_product_fp32_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp32_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp32_batch_armv8_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/float32/squared_euclidean.cc b/src/turbo/armv8/float32/squared_euclidean.cc new file mode 100644 index 000000000..b39fdac2e --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean.cc @@ -0,0 +1,54 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/float32/squared_euclidean.h" +#include "armv8/float32/squared_euclidean_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + squared_euclidean_fp32_armv8(a, b, dim, distance); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__ARM_NEON) + squared_euclidean_fp32_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/squared_euclidean.h b/src/turbo/armv8/float32/squared_euclidean.h new file mode 100644 index 000000000..3df75f17a --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/float32/squared_euclidean_common.h b/src/turbo/armv8/float32/squared_euclidean_common.h new file mode 100644 index 000000000..4f3419c56 --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean_common.h @@ -0,0 +1,150 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +//! 
Calculate Sum-of-Squared-Differences (GENERAL) +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +namespace zvec::turbo::armv8::internal { + +static __attribute__((always_inline)) void squared_euclidean_fp32_armv8( + const void *a, const void *b, size_t size, float *distance) { + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + size; + const float *last_aligned = lhs + ((size >> 3) << 3); + + float32x4_t v_sum_0 = vdupq_n_f32(0); + float32x4_t v_sum_1 = vdupq_n_f32(0); + + for (; lhs != last_aligned; lhs += 8, rhs += 8) { + float32x4_t v_d_0 = vsubq_f32(vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + float32x4_t v_d_1 = vsubq_f32(vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d_0, v_d_0); + v_sum_1 = vfmaq_f32(v_sum_1, v_d_1, v_d_1); + } + if (last >= last_aligned + 4) { + float32x4_t v_d = vsubq_f32(vld1q_f32(lhs), vld1q_f32(rhs)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d, v_d); + lhs += 4; + rhs += 4; + } + + float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); + switch (last - lhs) { + case 3: + SSD_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = result; +} + +template +static __attribute__((always_inline)) void +squared_euclidean_fp32_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < 
batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void squared_euclidean_fp32_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + squared_euclidean_fp32_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + squared_euclidean_fp32_batch_armv8_impl<1>( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } +} + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/cosine.cc b/src/turbo/armv8/half_float/cosine.cc new file mode 100644 index 000000000..baf39c702 --- /dev/null +++ b/src/turbo/armv8/half_float/cosine.cc @@ -0,0 +1,62 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +namespace zvec::turbo::armv8 { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_armv8(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/cosine.h b/src/turbo/armv8/half_float/cosine.h new file mode 100644 index 000000000..7d79f7bd7 --- /dev/null +++ b/src/turbo/armv8/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/inner_product.cc b/src/turbo/armv8/half_float/inner_product.cc new file mode 100644 index 000000000..7e0dcc448 --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product.cc @@ -0,0 +1,58 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, distance, 0ull, ) + +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__ARM_NEON) + inner_product_fp16_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/inner_product.h b/src/turbo/armv8/half_float/inner_product.h new file mode 100644 index 000000000..cfd824459 --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/inner_product_common.h b/src/turbo/armv8/half_float/inner_product_common.h new file mode 100644 index 000000000..54c3072ff --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product_common.h @@ -0,0 +1,210 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Scalar fused multiply-add for inner product (FP16 general) +#define ACCUM_FP16_STEP_GENERAL(m, q, sum) sum += (m * q); + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +//! 
NEON fused multiply-add for inner product (FP16) +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + v_sum = vfmaq_f16(v_sum, v_m, v_q); + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + _PROC(v_m, v_q, _RES##_0_0) \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP16_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float16x8_t v_m = \ + vcombine_f16(vld1_f16((const float16_t *)m), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + float16x8_t v_q = \ + vcombine_f16(vld1_f16((const float16_t *)q), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(v_sum_0_0)), \ + vcvt_high_f32_f16(v_sum_0_0))); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#else + +//! NEON fused multiply-add for inner product (FP32) +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + v_sum = vfmaq_f32(v_sum, v_m, v_q); + +//! 
Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + float32x4_t v_m_0 = vcvt_f32_f16(vget_low_f16(v_m)); \ + float32x4_t v_q_0 = vcvt_f32_f16(vget_low_f16(v_q)); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + v_m_0 = vcvt_high_f32_f16(v_m); \ + v_q_0 = vcvt_high_f32_f16(v_q); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP32_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float32x4_t v_m = vcvt_f32_f16(vld1_f16((const float16_t *)m)); \ + float32x4_t v_q = vcvt_f32_f16(vld1_f16((const float16_t *)q)); \ + ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(v_sum_0_0); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + +template +static __attribute__((always_inline)) void inner_product_fp16_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], 
vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void inner_product_fp16_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp16_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp16_batch_armv8_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/squared_euclidean.cc b/src/turbo/armv8/half_float/squared_euclidean.cc new file mode 100644 index 000000000..5f6ac829b --- /dev/null 
+++ b/src/turbo/armv8/half_float/squared_euclidean.cc @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/squared_euclidean.h" +#include "armv8/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__ARM_NEON) + squared_euclidean_fp16_batch_armv8(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/squared_euclidean.h b/src/turbo/armv8/half_float/squared_euclidean.h new file mode 100644 index 000000000..5a540b590 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 
2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute squared euclidean distance between a single quantized FP16 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP16. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/squared_euclidean_common.h b/src/turbo/armv8/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..df3807e61 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean_common.h @@ -0,0 +1,219 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Scalar sum of squared difference (FP16 general) +#define ACCUM_FP16_STEP_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +//! NEON sum of squared difference (FP16) +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float16x8_t v_d = vsubq_f16(v_m, v_q); \ + v_sum = vfmaq_f16(v_sum, v_d, v_d); \ + } + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + _PROC(v_m, v_q, _RES##_0_0) \ + } +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP16_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float16x8_t v_m = \ + vcombine_f16(vld1_f16((const float16_t *)m), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + float16x8_t v_q = \ + vcombine_f16(vld1_f16((const float16_t *)q), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(v_sum_0_0)), \ + vcvt_high_f32_f16(v_sum_0_0))); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#else + +//! NEON sum of squared difference (FP32) +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float32x4_t v_d = vsubq_f32(v_m, v_q); \ + v_sum = vfmaq_f32(v_sum, v_d, v_d); \ + } + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + float32x4_t v_m_0 = vcvt_f32_f16(vget_low_f16(v_m)); \ + float32x4_t v_q_0 = vcvt_f32_f16(vget_low_f16(v_q)); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + v_m_0 = vcvt_high_f32_f16(v_m); \ + v_q_0 = vcvt_high_f32_f16(v_q); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + } + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP32_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float32x4_t v_m = vcvt_f32_f16(vld1_f16((const float16_t *)m)); \ + float32x4_t v_q = vcvt_f32_f16(vld1_f16((const float16_t *)q)); \ + ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(v_sum_0_0); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + +template +static __attribute__((always_inline)) void +squared_euclidean_fp16_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + 
reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void squared_euclidean_fp16_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + squared_euclidean_fp16_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + squared_euclidean_fp16_batch_armv8_impl<1>( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } +} +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/avx/float32/common.h b/src/turbo/avx/float32/common.h new file mode 100644 index 000000000..acd06f0de --- /dev/null +++ b/src/turbo/avx/float32/common.h @@ -0,0 +1,166 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX__) + +#include +#include +#include +#include + +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_FP32_GENERAL(m, q, sum) sum += (m * q); + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float sum4(__m128 v) { + v = _mm_add_ps(v, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 8))); + return _mm_cvtss_f32(v) + _mm_cvtss_f32(_mm_shuffle_ps(v, v, 1)); +} + +static inline __m128 sum_top_bottom_avx(__m256 v) { + const __m128 high = _mm256_extractf128_ps(v, 1); + const __m128 low = _mm256_castps256_ps128(v); + return _mm_add_ps(high, low); +} + + +template +static std::enable_if_t, void> +inner_product_fp32_batch_avx_impl( + const ValueType *query, const ValueType *const *ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m256 accs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_setzero_ps(); + } + size_t dim = 0; + for (; dim + 8 <= dimensionality; dim += 8) { + __m256 q = _mm256_loadu_ps(query + dim); + + __m256 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm256_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + 
dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_fnmadd_ps(q, data_regs[i], accs[i]); + } + } + + __m128 sum128_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = sum_top_bottom_avx(accs[i]); + } + if (dim + 4 <= dimensionality) { + __m128 q = _mm_loadu_ps(query + dim); + + __m128 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = _mm_fnmadd_ps(q, data_regs[i], sum128_regs[i]); + } + dim += 4; + } + if (dim + 2 <= dimensionality) { + __m128 q = _mm_setzero_ps(); + + __m128 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_setzero_ps(); + } + + q = _mm_loadh_pi(q, (const __m64 *)(query + dim)); + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_loadh_pi(data_regs[i], (const __m64 *)(ptrs[i] + dim)); + } + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = _mm_fnmadd_ps(q, data_regs[i], sum128_regs[i]); + } + dim += 2; + } + + float res[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + res[i] = sum4(sum128_regs[i]); + } + if (dim < dimensionality) { + float q = query[dim]; + for (size_t i = 0; i < dp_batch; ++i) { + res[i] -= q * ptrs[i][dim]; + } + } + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = -res[i]; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. 
+static __attribute__((always_inline)) void inner_product_fp32_batch_avx( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + const float *typed_query = reinterpret_cast(query); + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = reinterpret_cast( + vectors[i + j + batch_size * prefetch_step]); + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp32_batch_avx_impl( + typed_query, reinterpret_cast(&vectors[i]), + prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp32_batch_avx_impl( + typed_query, reinterpret_cast(&vectors[i]), + prefetch_ptrs, dim, distances + i); + } +} + + +#endif \ No newline at end of file diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc new file mode 100644 index 000000000..d2f94f4bf --- /dev/null +++ b/src/turbo/avx/float32/cosine.cc @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/float32/cosine.h" +#include "avx/float32/common.h" +#include "avx/float32/inner_product.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + constexpr size_t extra_dim = 1; + size_t d = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, d, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + constexpr size_t extra_dim = 1; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp32_batch_distance(vectors, query, n, original_dim, distances); + + for (int i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/cosine.h b/src/turbo/avx/float32/cosine.h new file mode 100644 index 000000000..514a705e0 --- /dev/null +++ b/src/turbo/avx/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc new file mode 100644 index 000000000..10b30eee3 --- /dev/null +++ b/src/turbo/avx/float32/inner_product.cc @@ -0,0 +1,120 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/float32/inner_product.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#include +#endif + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 4) << 4); + + __m256 ymm_sum_0 = _mm256_setzero_ps(); + __m256 ymm_sum_1 = _mm256_setzero_ps(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_lhs_0 = _mm256_load_ps(lhs + 0); + __m256 ymm_lhs_1 = _mm256_load_ps(lhs + 8); + __m256 ymm_rhs_0 = _mm256_load_ps(rhs + 0); + __m256 ymm_rhs_1 = _mm256_load_ps(rhs + 8); + ymm_sum_0 = _mm256_fmadd_ps(ymm_lhs_0, ymm_rhs_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_lhs_1, ymm_rhs_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + ymm_sum_0 = + _mm256_fmadd_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs), ymm_sum_0); + lhs += 8; + rhs += 8; + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_lhs_0 = _mm256_loadu_ps(lhs + 0); + __m256 ymm_lhs_1 = _mm256_loadu_ps(lhs + 8); + __m256 ymm_rhs_0 = _mm256_loadu_ps(rhs + 0); + __m256 ymm_rhs_1 = _mm256_loadu_ps(rhs + 8); + ymm_sum_0 = _mm256_fmadd_ps(ymm_lhs_0, ymm_rhs_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_lhs_1, ymm_rhs_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + ymm_sum_0 = _mm256_fmadd_ps(_mm256_loadu_ps(lhs), _mm256_loadu_ps(rhs), + ymm_sum_0); + lhs += 8; + rhs += 8; + } + } + float result = HorizontalAdd_FP32_V256(_mm256_add_ps(ymm_sum_0, ymm_sum_1)); + + switch (last - lhs) { + case 7: + FMA_FP32_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_FP32_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_FP32_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_FP32_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + 
FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = -1 * result; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX__) + inner_product_fp32_batch_avx(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)distances; + (void)query; + (void)n; + (void)dim; +#endif // __AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/inner_product.h b/src/turbo/avx/float32/inner_product.h new file mode 100644 index 000000000..083a35f6f --- /dev/null +++ b/src/turbo/avx/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc new file mode 100644 index 000000000..9240ea7e9 --- /dev/null +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -0,0 +1,121 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/float32/squared_euclidean.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#include +#endif + +namespace zvec::turbo::avx { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 4) << 4); + + __m256 ymm_sum_0 = _mm256_setzero_ps(); + __m256 ymm_sum_1 = _mm256_setzero_ps(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_d_0 = + _mm256_sub_ps(_mm256_load_ps(lhs + 0), _mm256_load_ps(rhs + 0)); + __m256 ymm_d_1 = + _mm256_sub_ps(_mm256_load_ps(lhs + 8), _mm256_load_ps(rhs + 8)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d_0, ymm_d_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_d_1, ymm_d_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + __m256 ymm_d = _mm256_sub_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum_0); + lhs += 8; + rhs += 8; + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_d_0 = + _mm256_sub_ps(_mm256_loadu_ps(lhs + 0), _mm256_loadu_ps(rhs + 0)); + __m256 ymm_d_1 = + _mm256_sub_ps(_mm256_loadu_ps(lhs + 8), _mm256_loadu_ps(rhs + 8)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d_0, ymm_d_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_d_1, ymm_d_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + __m256 ymm_d = _mm256_sub_ps(_mm256_loadu_ps(lhs), _mm256_loadu_ps(rhs)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum_0); + lhs += 8; + rhs += 8; + } + } + float result = HorizontalAdd_FP32_V256(_mm256_add_ps(ymm_sum_0, ymm_sum_1)); + + switch (last - lhs) { + case 7: + SSD_FP32_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + SSD_FP32_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + 
SSD_FP32_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + SSD_FP32_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + SSD_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)distances; + (void)query; + (void)n; + (void)dim; +#endif // __AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/squared_euclidean.h b/src/turbo/avx/float32/squared_euclidean.h new file mode 100644 index 000000000..9e11f15bc --- /dev/null +++ b/src/turbo/avx/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. 
+void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc new file mode 100644 index 000000000..27a3c7dbd --- /dev/null +++ b/src/turbo/avx/half_float/cosine.cc @@ -0,0 +1,67 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/half_float/cosine.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/inner_product_common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + constexpr size_t extra_dim = 2; + size_t d = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, d, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/cosine.h b/src/turbo/avx/half_float/cosine.h new file mode 100644 index 000000000..5bd0a66f5 --- /dev/null +++ b/src/turbo/avx/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP16 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp16_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc new file mode 100644 index 000000000..4ac05de2a --- /dev/null +++ b/src/turbo/avx/half_float/inner_product.cc @@ -0,0 +1,58 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/half_float/inner_product.h" +#include "avx/half_float/inner_product_common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. 
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const ailego::Float16 *lhs = reinterpret_cast(a); + const ailego::Float16 *rhs = reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, NEGATE_FP32_GENERAL) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.h b/src/turbo/avx/half_float/inner_product.h new file mode 100644 index 000000000..08b5a8d73 --- /dev/null +++ b/src/turbo/avx/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h new file mode 100644 index 000000000..a6816d022 --- /dev/null +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -0,0 +1,179 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX__) + +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx { +//! Reverse sign of value (GENERAL) +#define NEGATE_FP32_GENERAL(v) -(v) + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! 
Calculate Fused-Multiply-Add (AVX) +#define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); + +#define ACCUM_FP32_STEP_AVX FMA_FP32_AVX + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 4) << 4); \ + if (((uintptr_t)m & 0x1f) == 0 && ((uintptr_t)q & 0x1f) == 0) { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_load_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } else { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_loadu_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i 
*)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx + +#endif \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc new file mode 100644 index 000000000..24913891c --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -0,0 +1,55 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/half_float/squared_euclidean.h" +#include "avx/half_float/squared_euclidean_common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + const ailego::Float16 *lhs = reinterpret_cast(a); + const ailego::Float16 *rhs = reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.h b/src/turbo/avx/half_float/squared_euclidean.h new file mode 100644 index 000000000..013b1f118 --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. 
+void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/squared_euclidean_common.h b/src/turbo/avx/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..8e58393d7 --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean_common.h @@ -0,0 +1,180 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX__) + +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx { + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! 
Calculate sum of squared difference (AVX) +#define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + { \ + __m256 ymm_d = _mm256_sub_ps(ymm_m, ymm_q); \ + ymm_sum = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX SSD_FP32_AVX + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 4) << 4); \ + if (((uintptr_t)m & 0x1f) == 0 && ((uintptr_t)q & 0x1f) == 0) { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_load_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } else { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_loadu_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx + +#endif \ No newline at end of file diff --git a/src/turbo/avx2/half_float_converter/common.h b/src/turbo/avx2/half_float_converter/common.h new file mode 100644 index 000000000..1b05591e8 --- /dev/null +++ b/src/turbo/avx2/half_float_converter/common.h @@ -0,0 +1,26 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..21e05b2c0 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -0,0 +1,95 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/inner_product_common.h" +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_batch_avx2(vectors, query, n, original_dim, + distances); + + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + + for (int i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + + float &result = distances[i]; + result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git 
a/src/turbo/avx2/record_quantized_int4/cosine.h b/src/turbo/avx2/record_quantized_int4/cosine.h new file mode 100644 index 000000000..77b4adad9 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..e70cf2ed1 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -0,0 +1,76 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx2/record_quantized_int4/inner_product.h" +#include "avx2/record_quantized_int4/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +// Compute squared Euclidean distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX2__ +} + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX2__) + internal::inner_product_int4_batch_avx2(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.h b/src/turbo/avx2/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..0e9e69d63 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int4/inner_product_common.h b/src/turbo/avx2/record_quantized_int4/inner_product_common.h new file mode 100644 index 000000000..8c96f5fb0 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product_common.h @@ -0,0 +1,250 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void inner_product_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +template +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff 
--git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc
new file mode 100644
index 000000000..1599a722d
--- /dev/null
+++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc
@@ -0,0 +1,111 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "avx2/record_quantized_int4/squared_euclidean.h"
+#include "avx2/record_quantized_int4/inner_product_common.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>
+#endif
+
+namespace zvec::turbo::avx2 {
+
+// Compute squared Euclidean distance between one quantized INT4 record
+// pair. `dim` is the total record length in bytes (packed payload plus a
+// 32-byte tail of per-record quantization floats).
+void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim,
+                                     float *distance) {
+#if defined(__AVX2__)
+  const int d = static_cast<int>(dim) - 32;
+
+  // Signed guard: the previous `original_dim <= 0` test was on an unsigned
+  // value and silently accepted records shorter than the tail (dim < 32
+  // wrapped to a huge payload size).
+  if (d <= 0) {
+    return;
+  }
+  const size_t original_dim = static_cast<size_t>(d) >> 1;
+
+  // Raw integer dot product of the packed payloads.
+  internal::inner_product_int4_avx2(a, b, original_dim, distance);
+
+  const float *a_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const uint8_t *>(a) + original_dim);
+  const float *b_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const uint8_t *>(b) + original_dim);
+
+  // Query-side dequantization floats (semantics set by the quantizer).
+  float qa = a_tail[0];
+  float qb = a_tail[1];
+  float qs = a_tail[2];
+  float qs2 = a_tail[3];
+
+  const float sum = qa * qs;
+  const float sum2 = qa * qa * qs2;
+
+  float ma = b_tail[0];
+  float mb = b_tail[1];
+  float ms = b_tail[2];
+  float ms2 = b_tail[3];
+
+  // Expand ||x - y||^2 from the quantized dot product and tail statistics.
+  *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance +
+              (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum);
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif  // __AVX2__
+}
+
+// Batch version: one query against `n` records, writing `distances[0..n)`.
+void squared_euclidean_int4_batch_distance(const void *const *vectors,
+                                           const void *query, size_t n,
+                                           size_t dim, float *distances) {
+#if defined(__AVX2__)
+  const int d = static_cast<int>(dim) - 32;
+
+  // Signed guard (see squared_euclidean_int4_distance): an unsigned
+  // `original_dim <= 0` check can never fire for dim < 32.
+  if (d <= 0) {
+    return;
+  }
+  const size_t original_dim = static_cast<size_t>(d) >> 1;
+
+  internal::inner_product_int4_batch_avx2(vectors, query, n, original_dim,
+                                          distances);
+
+  const float *q_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const uint8_t *>(query) + original_dim);
+
+  // Query-side dequantization floats, hoisted out of the per-record loop.
+  float qa = q_tail[0];
+  float qb = q_tail[1];
+  float qs = q_tail[2];
+  float qs2 = q_tail[3];
+
+  const float sum = qa * qs;
+  const float sum2 = qa * qa * qs2;
+
+  // size_t index: `n` is size_t, a plain int index mixed signedness.
+  for (size_t i = 0; i < n; ++i) {
+    const float *m_tail = reinterpret_cast<const float *>(
+        reinterpret_cast<const uint8_t *>(vectors[i]) + original_dim);
+
+    float ma = m_tail[0];
+    float mb = m_tail[1];
+    float ms = m_tail[2];
+    float ms2 = m_tail[3];
+
+    float &result = distances[i];
+    result = ma * ma * ms2 + sum2 - 2 * ma * qa * result +
+             (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum);
+  }
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif  //__AVX2__
+}
+
+}  // namespace zvec::turbo::avx2
\ No newline at end of file
diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.h b/src/turbo/avx2/record_quantized_int4/squared_euclidean.h
new file mode 100644
index 000000000..b6d15f698
--- /dev/null
+++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized INT4 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/cosine.cc b/src/turbo/avx2/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..b31df0a13 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/cosine.cc @@ -0,0 +1,69 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int8/cosine.h" +#include "avx2/record_quantized_int8/inner_product_common.h" +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/cosine.h b/src/turbo/avx2/record_quantized_int8/cosine.h new file mode 100644 index 000000000..6074ea428 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int8 vector pair. +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.cc b/src/turbo/avx2/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..4745c493a --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "avx2/record_quantized_int8/inner_product.h"
+#include "avx2/record_quantized_int8/inner_product_common.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>
+#endif
+
+namespace zvec::turbo::avx2 {
+
+// Compute inner product distance between a single quantized int8 vector
+// pair. `dim` is the total record length in bytes: the int8 payload
+// followed by a 20-byte tail of per-record quantization floats.
+void inner_product_int8_distance(const void *a, const void *b, size_t dim,
+                                 float *distance) {
+#if defined(__AVX2__)
+  // Signed: the old `size_t original_dim = dim - 20` wrapped for dim < 20
+  // so its `<= 0` guard could never fire. `int` also matches the sibling
+  // int8 cosine/squared-euclidean implementations.
+  const int original_dim = static_cast<int>(dim) - 20;
+
+  if (original_dim <= 0) {
+    return;
+  }
+
+  internal::inner_product_int8_avx2(a, b, original_dim, distance);
+
+  // Quantization parameters live immediately after the int8 payload.
+  const float *a_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const int8_t *>(a) + original_dim);
+  const float *b_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const int8_t *>(b) + original_dim);
+
+  // Per-vector dequantization floats (semantics defined by the quantizer).
+  float qa = a_tail[0];
+  float qb = a_tail[1];
+  float qs = a_tail[2];
+
+  float ma = b_tail[0];
+  float mb = b_tail[1];
+  float ms = b_tail[2];
+
+  // Dequantize and negate so a larger inner product is a smaller distance.
+  *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms +
+                static_cast<float>(original_dim) * qb * mb);
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif  //__AVX2__
+}
+
+// Batch version of inner_product_int8_distance.
+void inner_product_int8_batch_distance(const void *const *vectors,
+                                       const void *query, size_t n, size_t dim,
+                                       float *distances) {
+#if defined(__AVX2__)
+  // NOTE(review): AVX2 batch path is still a stub -- `distances` is left
+  // unwritten here; confirm callers do not rely on it yet.
+
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif  //__AVX2__
+}
+
+}  // namespace zvec::turbo::avx2
\ No newline at end of file
diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.h b/src/turbo/avx2/record_quantized_int8/inner_product.h
new file mode 100644
index 000000000..249bafd00
--- /dev/null
+++ b/src/turbo/avx2/record_quantized_int8/inner_product.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/inner_product_common.h b/src/turbo/avx2/record_quantized_int8/inner_product_common.h new file mode 100644 index 000000000..0176f277a --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product_common.h @@ -0,0 +1,236 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +static __attribute__((always_inline)) void inner_product_int8_avx2( + const void *a, const void *b, size_t size, float *distance) { + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 6) << 6); + float result = 0.0; + + __m256i ymm_sum_0 = _mm256_setzero_si256(); + __m256i ymm_sum_1 = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = 
_mm256_load_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + 
__m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } + result = static_cast( + HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +template +__attribute__((always_inline)) void inner_product_int8_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + // TBD +} + +static __attribute__((always_inline)) void inner_product_int8_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static 
constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int8_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int8_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..0c3c71079 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,76 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int8/squared_euclidean.h" +#include "avx2/record_quantized_int8/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..1bbfa6676 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h new file mode 100644 index 000000000..e460ade68 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h @@ -0,0 +1,250 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void squared_euclidean_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +template +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff 
--git a/src/turbo/avx512/float32/common.h b/src/turbo/avx512/float32/common.h new file mode 100644 index 000000000..af04d0e41 --- /dev/null +++ b/src/turbo/avx512/float32/common.h @@ -0,0 +1,42 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include + +//! Calculate Fused-Multiply-Add (AVX512) +#define FMA_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ps(zmm_m, zmm_q, zmm_sum); + + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +#endif // __AVX512F__ \ No newline at end of file diff --git a/src/turbo/avx512/float32/cosine.cc b/src/turbo/avx512/float32/cosine.cc new file mode 100644 index 000000000..3fff482c4 --- /dev/null +++ b/src/turbo/avx512/float32/cosine.cc @@ -0,0 +1,67 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512/float32/cosine.h" +#include "avx512/float32/common.h" +#include "avx512/float32/inner_product.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + constexpr size_t extra_dim = 1; + size_t d = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, d, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX512F__) + // `dim` is the full encoded size; the original vector occupies dim-24 bytes. 
+ const int original_dim = dim - 1; + if (original_dim <= 0) { + return; + } + + inner_product_fp32_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/cosine.h b/src/turbo/avx512/float32/cosine.h new file mode 100644 index 000000000..7e11de89f --- /dev/null +++ b/src/turbo/avx512/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/inner_product.cc b/src/turbo/avx512/float32/inner_product.cc new file mode 100644 index 000000000..b28ef2e6a --- /dev/null +++ b/src/turbo/avx512/float32/inner_product.cc @@ -0,0 +1,104 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512/float32/inner_product.h" +#include "avx512/float32/common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +// Compute squared Euclidean distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); + + __m512 zmm_sum_0 = _mm512_setzero_ps(); + __m512 zmm_sum_1 = _mm512_setzero_ps(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + FMA_FP32_AVX512(_mm512_load_ps(lhs + 0), _mm512_load_ps(rhs + 0), + zmm_sum_0) + + FMA_FP32_AVX512(_mm512_load_ps(lhs + 16), _mm512_load_ps(rhs + 16), + zmm_sum_1) + } + + if (last >= last_aligned + 16) { + FMA_FP32_AVX512(_mm512_load_ps(lhs), _mm512_load_ps(rhs), zmm_sum_0) + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + FMA_FP32_AVX512(_mm512_loadu_ps(lhs + 0), _mm512_loadu_ps(rhs + 0), + zmm_sum_0) + + FMA_FP32_AVX512(_mm512_loadu_ps(lhs + 16), _mm512_loadu_ps(rhs + 16), + zmm_sum_1) + } + + if (last >= last_aligned + 16) { + FMA_FP32_AVX512(_mm512_loadu_ps(lhs), _mm512_loadu_ps(rhs), zmm_sum_0) + lhs += 16; + rhs += 16; + } + } + + zmm_sum_0 = _mm512_add_ps(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask16 mask = (__mmask16)((1 << (last - lhs)) - 1); + __m512 zmm_undefined = _mm512_undefined_ps(); + zmm_sum_0 = _mm512_mask3_fmadd_ps( + _mm512_mask_loadu_ps(zmm_undefined, mask, lhs), + _mm512_mask_loadu_ps(zmm_undefined, mask, rhs), zmm_sum_0, mask); + } + + *distance = -1 * HorizontalAdd_FP32_V512(zmm_sum_0); + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX512F__ +} + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp32_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/inner_product.h b/src/turbo/avx512/float32/inner_product.h new file mode 100644 index 000000000..d1f48eecf --- /dev/null +++ b/src/turbo/avx512/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/float32/squared_euclidean.cc b/src/turbo/avx512/float32/squared_euclidean.cc new file mode 100644 index 000000000..cc00cacf9 --- /dev/null +++ b/src/turbo/avx512/float32/squared_euclidean.cc @@ -0,0 +1,105 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512/float32/squared_euclidean.h" +#include "avx512/float32/common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); + + __m512 zmm_sum_0 = _mm512_setzero_ps(); + __m512 zmm_sum_1 = _mm512_setzero_ps(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m512 zmm_d_0 = + _mm512_sub_ps(_mm512_load_ps(lhs + 0), _mm512_load_ps(rhs + 0)); + __m512 zmm_d_1 = + _mm512_sub_ps(_mm512_load_ps(lhs + 16), _mm512_load_ps(rhs + 16)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ps(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m512 zmm_d = _mm512_sub_ps(_mm512_load_ps(lhs), _mm512_load_ps(rhs)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m512 zmm_d_0 = + _mm512_sub_ps(_mm512_loadu_ps(lhs + 0), _mm512_loadu_ps(rhs + 0)); + __m512 zmm_d_1 = + _mm512_sub_ps(_mm512_loadu_ps(lhs + 16), _mm512_loadu_ps(rhs + 16)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ps(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m512 zmm_d = _mm512_sub_ps(_mm512_loadu_ps(lhs), _mm512_loadu_ps(rhs)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum_0); + lhs += 16; + rhs += 16; + } + } + + zmm_sum_0 = _mm512_add_ps(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask16 mask = (__mmask16)((1 << (last - lhs)) - 1); + __m512 zmm_undefined = _mm512_undefined_ps(); + __m512 zmm_d = _mm512_mask_sub_ps( + zmm_undefined, mask, 
_mm512_mask_loadu_ps(zmm_undefined, mask, lhs), + _mm512_mask_loadu_ps(zmm_undefined, mask, rhs)); + zmm_sum_0 = _mm512_mask3_fmadd_ps(zmm_d, zmm_d, zmm_sum_0, mask); + } + + *distance = HorizontalAdd_FP32_V512(zmm_sum_0); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/squared_euclidean.h b/src/turbo/avx512/float32/squared_euclidean.h new file mode 100644 index 000000000..8b43b540e --- /dev/null +++ b/src/turbo/avx512/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc new file mode 100644 index 000000000..bf08eb744 --- /dev/null +++ b/src/turbo/avx512/half_float/cosine.cc @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512/half_float/cosine.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/inner_product_common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512 { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX512F__) + constexpr size_t extra_dim = 2; + const size_t original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + 
distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/cosine.h b/src/turbo/avx512/half_float/cosine.h new file mode 100644 index 000000000..1e068dd6e --- /dev/null +++ b/src/turbo/avx512/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc new file mode 100644 index 000000000..221d0a2ab --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512F__) +#include +#include +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/inner_product_common.h" + +using namespace zvec::turbo::avx512::internal; +#endif + +namespace zvec::turbo::avx512 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX512(lhs, rhs, dim, distance, 0ull, NEGATE_FP32_GENERAL) +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.h b/src/turbo/avx512/half_float/inner_product.h new file mode 100644 index 000000000..833d4c8c3 --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/inner_product_common.h b/src/turbo/avx512/half_float/inner_product_common.h new file mode 100644 index 000000000..dcd6f2a83 --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product_common.h @@ -0,0 +1,209 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx512::internal { +//! 
Reverse sign of value (GENERAL) +#define NEGATE_FP32_GENERAL(v) -(v) + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX512(m, q, _RES, _LOAD, _PROC) \ + { \ + __m512i zmm_mi = _LOAD((const __m512i *)m); \ + __m512i zmm_qi = _LOAD((const __m512i *)q); \ + __m512 zmm_m = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_mi)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_qi)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + zmm_m = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_mi, 1)); \ + zmm_q = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_qi, 1)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + } + +//! Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int 
*)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + 
__m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +//! Calculate Fused-Multiply-Add (AVX) +#define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); + +#define ACCUM_FP32_STEP_AVX FMA_FP32_AVX + +//! Calculate Fused-Multiply-Add (AVX512) +#define FMA_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ps(zmm_m, zmm_q, zmm_sum); + +#define ACCUM_FP32_STEP_AVX512 FMA_FP32_AVX512 + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX512(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m512, zmm_sum, _mm512_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 5) << 5); \ + if (((uintptr_t)m & 0x3f) == 0 && ((uintptr_t)q & 0x3f) == 0) { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_load_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } else { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_loadu_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } \ + __m256 ymm_sum_0_0 = _mm256_add_ps(_mm512_castps512_ps256(zmm_sum_0_0), \ + _mm256_castpd_ps(_mm512_extractf64x4_pd( \ + _mm512_castps_pd(zmm_sum_0_0), 1))); \ + if (qe >= q + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx512::internal + +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc new file mode 100644 index 000000000..7a4b18e11 --- 
/dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512F__) +#include +#include +#include "avx512/half_float/squared_euclidean.h" +#include "avx512/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::avx512::internal; +#endif + +namespace zvec::turbo::avx512 { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512F__) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX512(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512F__) + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/squared_euclidean.h b/src/turbo/avx512/half_float/squared_euclidean.h new file mode 100644 index 000000000..399e238b0 --- /dev/null +++ 
b/src/turbo/avx512/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/squared_euclidean_common.h b/src/turbo/avx512/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..6ff8c4254 --- /dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean_common.h @@ -0,0 +1,200 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx512::internal { + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX512(m, q, _RES, _LOAD, _PROC) \ + { \ + __m512i zmm_mi = _LOAD((const __m512i *)m); \ + __m512i zmm_qi = _LOAD((const __m512i *)q); \ + __m512 zmm_m = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_mi)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_qi)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + zmm_m = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_mi, 1)); \ + zmm_q = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_qi, 1)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + } + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +//! Calculate sum of squared difference (AVX) +#define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + { \ + __m256 ymm_d = _mm256_sub_ps(ymm_m, ymm_q); \ + ymm_sum = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX SSD_FP32_AVX + +//! 
Calculate sum of squared difference (AVX512) +#define SSD_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + { \ + __m512 zmm_d = _mm512_sub_ps(zmm_m, zmm_q); \ + zmm_sum = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX512 SSD_FP32_AVX512 + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX512(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m512, zmm_sum, _mm512_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 5) << 5); \ + if (((uintptr_t)m & 0x3f) == 0 && ((uintptr_t)q & 0x3f) == 0) { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_load_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } else { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_loadu_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } \ + __m256 ymm_sum_0_0 = _mm256_add_ps(_mm512_castps512_ps256(zmm_sum_0_0), \ + _mm256_castpd_ps(_mm512_extractf64x4_pd( \ + _mm512_castps_pd(zmm_sum_0_0), 1))); \ + if (qe >= q + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx512::internal + +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512_fp16/half_float/cosine.cc b/src/turbo/avx512_fp16/half_float/cosine.cc new file mode 100644 index 000000000..a5404712a --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/cosine.cc @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512_fp16/half_float/cosine.h" +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/inner_product_common.h" + +#if defined(__AVX512FP16__) +#include +#endif + +namespace zvec::turbo::avx512_fp16 { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512FP16__) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX512FP16__) + constexpr size_t extra_dim = 2; + const size_t original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/cosine.h b/src/turbo/avx512_fp16/half_float/cosine.h new file mode 100644 index 000000000..2b57bcf9e --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512_fp16 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/inner_product.cc b/src/turbo/avx512_fp16/half_float/inner_product.cc new file mode 100644 index 000000000..c7262577d --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product.cc @@ -0,0 +1,112 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512FP16__) +#include +#include +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/inner_product_common.h" + +using namespace zvec::ailego; + +using namespace zvec::turbo::avx512_fp16::internal; + +#endif + +namespace zvec::turbo::avx512_fp16 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. 
// Compute the inner-product distance between a single FP16 vector pair
// using AVX512-FP16 intrinsics.
//
// a, b     : pointers to `dim` Float16 values each.
// dim      : number of elements.
// distance : out-parameter; receives -(a . b), so a larger inner product
//            maps to a smaller distance. Left untouched when AVX512-FP16
//            support is compiled out.
void inner_product_fp16_distance(const void *a, const void *b, size_t dim,
                                 float *distance) {
#if defined(__AVX512FP16__)
  const Float16 *lhs = reinterpret_cast<const Float16 *>(a);
  const Float16 *rhs = reinterpret_cast<const Float16 *>(b);

  const Float16 *last = lhs + dim;
  // Main loop consumes 64 elements per iteration (two 32-lane accumulators
  // to hide FMA latency).
  const Float16 *last_aligned = lhs + ((dim >> 6) << 6);

  __m512h zmm_sum_0 = _mm512_setzero_ph();
  __m512h zmm_sum_1 = _mm512_setzero_ph();

  if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) {
    // Both operands are 64-byte aligned: use aligned loads.
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      FMA_FP16_AVX512FP16(_mm512_load_ph(lhs + 0), _mm512_load_ph(rhs + 0),
                          zmm_sum_0)
      FMA_FP16_AVX512FP16(_mm512_load_ph(lhs + 32), _mm512_load_ph(rhs + 32),
                          zmm_sum_1)
    }
    if (last >= last_aligned + 32) {
      FMA_FP16_AVX512FP16(_mm512_load_ph(lhs), _mm512_load_ph(rhs), zmm_sum_0)
      lhs += 32;
      rhs += 32;
    }
  } else {
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs + 0), _mm512_loadu_ph(rhs + 0),
                          zmm_sum_0)
      FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs + 32), _mm512_loadu_ph(rhs + 32),
                          zmm_sum_1)
    }
    if (last >= last_aligned + 32) {
      FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs), _mm512_loadu_ph(rhs), zmm_sum_0)
      lhs += 32;
      rhs += 32;
    }
  }

  zmm_sum_0 = _mm512_add_ph(zmm_sum_0, zmm_sum_1);

  if (lhs != last) {
    // Tail of 1..31 elements: masked loads + masked FMA.
    // Use an unsigned literal for the shift: (last - lhs) can be 31, and
    // `1 << 31` on a signed int is undefined behavior.
    __mmask32 mask = (__mmask32)((1u << (last - lhs)) - 1u);
    __m512i zmm_undefined = _mm512_undefined_epi32();
    zmm_sum_0 = _mm512_mask3_fmadd_ph(
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, lhs)),
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, rhs)),
        zmm_sum_0, mask);
  }

  // Negate so that a larger inner product yields a smaller distance.
  *distance = -HorizontalAdd_FP16_V512(zmm_sum_0);
#else
  (void)a;
  (void)b;
  (void)dim;
  (void)distance;
#endif
}

// Batch version of inner_product_fp16_distance.
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512FP16__) + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __AVX512FP16__ +} + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/inner_product.h b/src/turbo/avx512_fp16/half_float/inner_product.h new file mode 100644 index 000000000..a80944713 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512_fp16 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512_fp16 diff --git a/src/turbo/avx512_fp16/half_float/inner_product_common.h b/src/turbo/avx512_fp16/half_float/inner_product_common.h new file mode 100644 index 000000000..30921e038 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product_common.h @@ -0,0 +1,53 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX512FP16__) +#include +#include +#include + +namespace zvec::turbo::avx512_fp16::internal { + +//! 
Calculate Fused-Multiply-Add (AVX512FP16) +#define FMA_FP16_AVX512FP16(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ph(zmm_m, zmm_q, zmm_sum); + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +static inline float HorizontalAdd_FP16_V512(__m512h v) { + __m512 low = _mm512_cvtxph_ps(_mm512_castph512_ph256(v)); + __m512 high = _mm512_cvtxph_ps( + _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(v), 1))); + + return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); +} + +} // namespace zvec::turbo::avx512_fp16::internal + +#endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc new file mode 100644 index 000000000..5e33255b3 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc @@ -0,0 +1,114 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#include <cstddef>  // size_t -- NOTE(review): original include target lost in extraction; confirm

// NOTE(review): this include block was guarded by __AVX512F__ while every
// function body below is guarded by __AVX512FP16__; use one guard so the
// FP16 helpers are only pulled in when they can actually be used.
#if defined(__AVX512FP16__)
#include <cstdint>     // uintptr_t -- NOTE(review): original target lost in extraction; confirm
#include <immintrin.h>
#include "avx512_fp16/half_float/squared_euclidean.h"
#include "avx512_fp16/half_float/squared_euclidean_common.h"

using namespace zvec::ailego;

using namespace zvec::turbo::avx512_fp16::internal;

#endif

namespace zvec::turbo::avx512_fp16 {

// Compute the squared Euclidean distance between a single FP16 vector pair
// using AVX512-FP16 intrinsics.
//
// a, b     : pointers to `dim` Float16 values each.
// dim      : number of elements.
// distance : out-parameter; receives sum((a[i] - b[i])^2). Left untouched
//            when AVX512-FP16 support is compiled out.
void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim,
                                     float *distance) {
#if defined(__AVX512FP16__)
  const Float16 *lhs = reinterpret_cast<const Float16 *>(a);
  const Float16 *rhs = reinterpret_cast<const Float16 *>(b);

  const Float16 *last = lhs + dim;
  // Main loop consumes 64 elements per iteration (two 32-lane accumulators
  // to hide FMA latency).
  const Float16 *last_aligned = lhs + ((dim >> 6) << 6);

  __m512h zmm_sum_0 = _mm512_setzero_ph();
  __m512h zmm_sum_1 = _mm512_setzero_ph();

  if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) {
    // Both operands are 64-byte aligned: use aligned loads.
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      __m512h zmm_d_0 =
          _mm512_sub_ph(_mm512_load_ph(lhs + 0), _mm512_load_ph(rhs + 0));
      __m512h zmm_d_1 =
          _mm512_sub_ph(_mm512_load_ph(lhs + 32), _mm512_load_ph(rhs + 32));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d_0, zmm_d_0, zmm_sum_0);
      zmm_sum_1 = _mm512_fmadd_ph(zmm_d_1, zmm_d_1, zmm_sum_1);
    }

    if (last >= last_aligned + 32) {
      __m512h zmm_d = _mm512_sub_ph(_mm512_load_ph(lhs), _mm512_load_ph(rhs));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d, zmm_d, zmm_sum_0);
      lhs += 32;
      rhs += 32;
    }
  } else {
    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
      __m512h zmm_d_0 =
          _mm512_sub_ph(_mm512_loadu_ph(lhs + 0), _mm512_loadu_ph(rhs + 0));
      __m512h zmm_d_1 =
          _mm512_sub_ph(_mm512_loadu_ph(lhs + 32), _mm512_loadu_ph(rhs + 32));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d_0, zmm_d_0, zmm_sum_0);
      zmm_sum_1 = _mm512_fmadd_ph(zmm_d_1, zmm_d_1, zmm_sum_1);
    }

    if (last >= last_aligned + 32) {
      __m512h zmm_d = _mm512_sub_ph(_mm512_loadu_ph(lhs), _mm512_loadu_ph(rhs));
      zmm_sum_0 = _mm512_fmadd_ph(zmm_d, zmm_d, zmm_sum_0);
      lhs += 32;
      rhs += 32;
    }
  }

  zmm_sum_0 = _mm512_add_ph(zmm_sum_0, zmm_sum_1);
  if (lhs != last) {
    // Tail of 1..31 elements: masked loads, masked subtract, masked FMA.
    // Use an unsigned literal for the shift: (last - lhs) can be 31, and
    // `1 << 31` on a signed int is undefined behavior.
    __mmask32 mask = (__mmask32)((1u << (last - lhs)) - 1u);
    __m512i zmm_undefined = _mm512_undefined_epi32();
    __m512h zmm_undefined_ph = _mm512_undefined_ph();
    __m512h zmm_d = _mm512_mask_sub_ph(
        zmm_undefined_ph, mask,
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, lhs)),
        _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, rhs)));
    zmm_sum_0 = _mm512_mask3_fmadd_ph(zmm_d, zmm_d, zmm_sum_0, mask);
  }

  *distance = HorizontalAdd_FP16_V512(zmm_sum_0);
#else
  (void)a;
  (void)b;
  (void)dim;
  (void)distance;
#endif  // __AVX512FP16__
}

// Batch version of squared_euclidean_fp16_distance.
//
// BUG FIX: this definition was named squared_euclidean_fp32_batch_distance,
// while squared_euclidean.h declares squared_euclidean_fp16_batch_distance;
// the declared symbol was never defined, producing a link error for any
// caller. No header declared the fp32-named symbol, so renaming is safe.
void squared_euclidean_fp16_batch_distance(const void *const *vectors,
                                           const void *query, size_t n,
                                           size_t dim, float *distances) {
#if defined(__AVX512FP16__)
  for (size_t i = 0; i < n; ++i) {
    squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]);
  }
#else
  (void)vectors;
  (void)query;
  (void)n;
  (void)dim;
  (void)distances;
#endif  // __AVX512FP16__
}

}  // namespace zvec::turbo::avx512_fp16
// ---- patch metadata from the mangled diff (preserved as comments) ----
// diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.h b/src/turbo/avx512_fp16/half_float/squared_euclidean.h
// new file mode 100644
// index 000000000..669749f51
// --- /dev/null
// +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.h
// @@ -0,0 +1,31 @@
// Copyright 2025-present the zvec project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +#pragma once + +#include + +namespace zvec::turbo::avx512_fp16 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512_fp16 diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..b5f91988e --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__AVX512FP16__) +#include +#include +#include + +namespace zvec::turbo::avx512_fp16::internal { + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +static inline float HorizontalAdd_FP16_V512(__m512h v) { + __m512 low = _mm512_cvtxph_ps(_mm512_castph512_ph256(v)); + __m512 high = _mm512_cvtxph_ps( + _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(v), 1))); + + return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); +} + +} // namespace zvec::turbo::avx512_fp16::internal + +#endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..db83b128a --- /dev/null +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc @@ -0,0 +1,65 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512_vnni/record_quantized_int8/inner_product.h" +#include +#include "avx512_vnni/record_quantized_int8/common.h" + +namespace zvec::turbo::avx512_vnni { + +// Compute squared Euclidean distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const size_t original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::ip_int8_avx512_vnni(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX512VNNI__) + internal::ip_int8_batch_avx512_vnni(vectors, query, n, dim, distances); +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif // __AVX512VNNI__ +} + +} // namespace zvec::turbo::avx512_vnni \ No newline at end of file diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..25f0ce109 --- /dev/null +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512_vnni { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512_vnni diff --git a/src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc b/src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc new file mode 100644 index 000000000..e69de29bb diff --git a/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc b/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc new file mode 100644 index 000000000..72617e56b --- /dev/null +++ b/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc @@ -0,0 +1,21 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#pragma once + +namespace zvec { +namespace turbo {} // namespace turbo +} // namespace zvec \ No newline at end of file diff --git a/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h b/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h new file mode 100644 index 000000000..8e083ae25 --- /dev/null +++ b/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h @@ -0,0 +1,48 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#pragma once + +namespace zvec { +namespace turbo { + +class RecordInt8Quantizer : public Quantizer { + public: + RecordInt8Quantizer() : type_{QuantizeType::kRecordInt8} {} + + virtual ~RecordInt8Quantizer() {} + + public: + QuantizeType type() const override { + return type_; + } + + const IndexMeta &meta(void) const override { + return meta_; + } + + private: + IndexMeta meta_{}; + IndexHolder::Pointer holder_{}; + std::shared_ptr quantizer_{}; + Stats stats_{}; + IndexMeta::DataType data_type_{}; +}; + + +} // namespace turbo +} // namespace zvec diff --git a/src/turbo/quantizer/quantizer.h b/src/turbo/quantizer/quantizer.h new file mode 100644 index 000000000..b051a6d87 --- /dev/null +++ b/src/turbo/quantizer/quantizer.h @@ -0,0 +1,33 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#pragma once + +namespace zvec { +namespace turbo { + +class Quantizer { + public: + Quantizer() {}; + virtual ~Quantizer() {}; + + private: + QuantizeType type_{QuantizeType::kDefault}; +}; + +} // namespace turbo +} // namespace zvec diff --git a/src/turbo/scalar/float32/cosine.cc b/src/turbo/scalar/float32/cosine.cc new file mode 100644 index 000000000..cffb0b166 --- /dev/null +++ b/src/turbo/scalar/float32/cosine.cc @@ -0,0 +1,39 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float32/cosine.h" +#include "scalar/float32/inner_product.h" + +namespace zvec::turbo::scalar { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { + constexpr size_t extra_dim = 1; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + inner_product_fp32_batch_distance(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; i++) { + distances[i] = 1 - distances[i]; + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/cosine.h b/src/turbo/scalar/float32/cosine.h new file mode 100644 index 000000000..b5e4f4eee --- /dev/null +++ b/src/turbo/scalar/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
// ---- file: turbo/scalar/float32/inner_product.cc ----

// Guarded so this TU also builds standalone (the project header only
// repeats the declarations defined below).
#if __has_include("scalar/float32/inner_product.h")
#include "scalar/float32/inner_product.h"
#endif

#include <cstddef>

namespace zvec::turbo::scalar {

// Inner-product distance between one pair of FP32 vectors: the negated dot
// product, so smaller values mean "more similar".
// (The previous comment said "squared Euclidean distance" — wrong kernel.)
void inner_product_fp32_distance(const void *a, const void *b, size_t dim,
                                 float *distance) {
  const float *m = reinterpret_cast<const float *>(a);
  const float *q = reinterpret_cast<const float *>(b);

  float sum = 0.0f;
  for (size_t i = 0; i < dim; ++i) {
    sum += m[i] * q[i];
  }

  *distance = -sum;
}

// Batch version of inner_product_fp32_distance: distance from `query` to
// each of the `n` vectors.
void inner_product_fp32_batch_distance(const void *const *vectors,
                                       const void *query, size_t n, size_t dim,
                                       float *distances) {
  for (size_t i = 0; i < n; ++i) {
    inner_product_fp32_distance(vectors[i], query, dim, &distances[i]);
  }
}

}  // namespace zvec::turbo::scalar
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/float32/squared_euclidean.cc b/src/turbo/scalar/float32/squared_euclidean.cc new file mode 100644 index 000000000..a3ffd10bb --- /dev/null +++ b/src/turbo/scalar/float32/squared_euclidean.cc @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float32/squared_euclidean.h" +#include + +namespace zvec::turbo::scalar { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { + const float *m = reinterpret_cast(a); + const float *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += zvec::ailego::MathHelper::SquaredDifference(m[i], q[i]); + } + + *distance = sum; +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/squared_euclidean.h b/src/turbo/scalar/float32/squared_euclidean.h new file mode 100644 index 000000000..bf319c1d2 --- /dev/null +++ b/src/turbo/scalar/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/half_float/cosine.cc b/src/turbo/scalar/half_float/cosine.cc new file mode 100644 index 000000000..3c7a39550 --- /dev/null +++ b/src/turbo/scalar/half_float/cosine.cc @@ -0,0 +1,38 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/half_float/cosine.h" +#include "scalar/half_float/inner_product.h" + +namespace zvec::turbo::scalar { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/cosine.h b/src/turbo/scalar/half_float/cosine.h new file mode 100644 index 000000000..cb82bc893 --- /dev/null +++ b/src/turbo/scalar/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP16 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp16_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/inner_product.cc b/src/turbo/scalar/half_float/inner_product.cc new file mode 100644 index 000000000..d06c45b25 --- /dev/null +++ b/src/turbo/scalar/half_float/inner_product.cc @@ -0,0 +1,46 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/half_float/inner_product.h" +#include + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + const zvec::ailego::Float16 *m = + reinterpret_cast(a); + const zvec::ailego::Float16 *q = + reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += static_cast(m[i] * q[i]); + } + + *distance = -sum; +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/inner_product.h b/src/turbo/scalar/half_float/inner_product.h new file mode 100644 index 000000000..98fc4cba4 --- /dev/null +++ b/src/turbo/scalar/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized FP16 +// vector pair. 
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/half_float/squared_euclidean.cc b/src/turbo/scalar/half_float/squared_euclidean.cc new file mode 100644 index 000000000..c3f6b3c2e --- /dev/null +++ b/src/turbo/scalar/half_float/squared_euclidean.cc @@ -0,0 +1,43 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/half_float/squared_euclidean.h" +#include + +namespace zvec::turbo::scalar { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + const zvec::ailego::Float16 *m = + reinterpret_cast(a); + const zvec::ailego::Float16 *q = + reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += zvec::ailego::MathHelper::SquaredDifference(m[i], q[i]); + } + + *distance = sum; +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/squared_euclidean.h b/src/turbo/scalar/half_float/squared_euclidean.h new file mode 100644 index 000000000..8865cd1c2 --- /dev/null +++ b/src/turbo/scalar/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized FP16 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h new file mode 100644 index 000000000..f4b74d7d3 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/common.h @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace zvec::turbo::scalar::internal { + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +static __attribute__((always_inline)) void inner_product_int4_scalar( + const void *a, const void *b, size_t dim, float *distance) { + const uint8_t *m = reinterpret_cast(a); + const uint8_t *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + uint8_t m_val = m[i]; + uint8_t q_val = q[i]; + sum += Int4MulTable[((m_val << 4) & 0xf0) | ((q_val >> 0) & 0xf)] + + Int4MulTable[((m_val >> 0) & 0xf0) | ((q_val >> 4) & 0xf)]; + } + + *distance = sum; +} + +} // namespace zvec::turbo::scalar::internal \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/cosine.cc b/src/turbo/scalar/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..cab09202d --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/cosine.cc @@ -0,0 +1,55 @@ +// Copyright 
2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int4/cosine.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int d = dim - 40; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_int4_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/cosine.h b/src/turbo/scalar/record_quantized_int4/cosine.h new file mode 100644 index 000000000..25838aa02 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec 
project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.cc b/src/turbo/scalar/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..02bdec849 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/inner_product.cc @@ -0,0 +1,59 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int4/inner_product.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized int4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); +} + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_int4_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.h b/src/turbo/scalar/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..b34d47aa4 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized int4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..555f96246 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int4/squared_euclidean.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); +} + +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_int4_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.h b/src/turbo/scalar/record_quantized_int4/squared_euclidean.h new file mode 100644 index 000000000..ea37cfdec --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int8/common.h b/src/turbo/scalar/record_quantized_int8/common.h new file mode 100644 index 000000000..d0b7186ae --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/common.h @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar::internal { + +static inline __attribute__((always_inline)) void inner_product_int8_scalar( + const void *a, const void *b, size_t dim, float *distance) { + const int8_t *m = reinterpret_cast<const int8_t *>(a); + const int8_t *q = reinterpret_cast<const int8_t *>(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += static_cast<float>(m[i] * q[i]); + } + + *distance = -sum; +} + +} // namespace zvec::turbo::scalar::internal diff --git a/src/turbo/scalar/record_quantized_int8/cosine.cc b/src/turbo/scalar/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..fe5faf8e7 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/cosine.cc @@ -0,0 +1,56 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int8/cosine.h" +#include +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int original_dim = dim - 24; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + *distance = -*distance; + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_int8_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/cosine.h b/src/turbo/scalar/record_quantized_int8/cosine.h new file mode 100644 index 000000000..e06d8b234 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int8 vector pair. +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.cc b/src/turbo/scalar/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..e33cdac12 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/inner_product.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int8/inner_product.h" +#include +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized int8 +// vector pair. 
+void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + + *distance = -1 * *distance; + + const float *a_tail = reinterpret_cast<const float *>( + reinterpret_cast<const uint8_t *>(a) + original_dim); + const float *b_tail = reinterpret_cast<const float *>( + reinterpret_cast<const uint8_t *>(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_int8_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.h b/src/turbo/scalar/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..1ed51489a --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..d05d1a049 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int8/squared_euclidean.h" +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + *distance = -*distance; + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_int8_distance(vectors[i], query, dim, &distances[i]); + } +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.h b/src/turbo/scalar/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..07db60519 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/sse/record_quantized_int4/common.h b/src/turbo/sse/record_quantized_int4/common.h new file mode 100644 index 000000000..623d6365a --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/common.h @@ -0,0 +1,174 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__SSE4_1__) +#include +#include +#include +#include + +namespace zvec::turbo::sse::internal { + +//! 
Four-bits Convert Table +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +/*! Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} + +static __attribute__((always_inline)) void inner_product_int4_sse( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i 
*)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +} // namespace zvec::turbo::sse::internal + +#endif // defined(__SSE4_1__) diff --git a/src/turbo/sse/record_quantized_int4/cosine.cc b/src/turbo/sse/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..5751e511d --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/cosine.cc @@ -0,0 +1,70 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int4/cosine.h" +#include "sse/record_quantized_int4/common.h" +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE4_1__ +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__SSE4_1__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE4_1__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/cosine.h b/src/turbo/sse/record_quantized_int4/cosine.h new file mode 100644 index 000000000..87306a06e --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 
2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/inner_product.cc b/src/turbo/sse/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..47121a668 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/inner_product.cc @@ -0,0 +1,76 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "sse/record_quantized_int4/inner_product.h" +#include "sse/record_quantized_int4/common.h" + +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +// Compute squared inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__SSE4_1__ +} + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__SSE4_1__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE4_1__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/inner_product.h b/src/turbo/sse/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..4ee508ed2 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..59155e2f3 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,78 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "sse/record_quantized_int4/squared_euclidean.h" +#include "sse/record_quantized_int4/common.h" + +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE4_1__ +} + +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__SSE4_1__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE4_1__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.h b/src/turbo/sse/record_quantized_int4/squared_euclidean.h new file mode 100644 index 000000000..3cff9f99b --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute squared euclidean distance between a single quantized INT4 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int8/common.h b/src/turbo/sse/record_quantized_int8/common.h new file mode 100644 index 000000000..b48b2598e --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/common.h @@ -0,0 +1,210 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__SSE__) +#include +#include +#include +#include + +namespace zvec::turbo::sse::internal { + +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static __attribute__((always_inline)) void inner_product_int8_sse( + const void *a, const void *b, size_t size, float *distance) { + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 5) << 5); + + __m128i xmm_sum_0 = _mm_setzero_si128(); + __m128i xmm_sum_1 = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m128i xmm_lhs_0 = _mm_load_si128((const __m128i *)(lhs + 0)); + __m128i xmm_lhs_1 = _mm_load_si128((const __m128i *)(lhs + 16)); + __m128i xmm_rhs_0 = _mm_load_si128((const __m128i *)(rhs + 0)); + __m128i xmm_rhs_1 = _mm_load_si128((const __m128i *)(rhs + 16)); + + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); + xmm_sum_0 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), + ONES_INT16_SSE), + xmm_sum_0); + xmm_sum_1 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), + ONES_INT16_SSE), + xmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m128i xmm_lhs = 
_mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + xmm_sum_0 = _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), ONES_INT16_SSE), + xmm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m128i xmm_lhs_0 = _mm_loadu_si128((const __m128i *)(lhs + 0)); + __m128i xmm_lhs_1 = _mm_loadu_si128((const __m128i *)(lhs + 16)); + __m128i xmm_rhs_0 = _mm_loadu_si128((const __m128i *)(rhs + 0)); + __m128i xmm_rhs_1 = _mm_loadu_si128((const __m128i *)(rhs + 16)); + + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); + xmm_sum_0 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), + ONES_INT16_SSE), + xmm_sum_0); + xmm_sum_1 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), + ONES_INT16_SSE), + xmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + xmm_sum_0 = _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), ONES_INT16_SSE), + xmm_sum_0); + lhs += 16; + rhs += 16; + } + } + float result = static_cast( + HorizontalAdd_INT32_V128(_mm_add_epi32(xmm_sum_0, xmm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], 
result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +template +__attribute__((always_inline)) void inner_product_int8_batch_sse_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + // TBD +} + +static __attribute__((always_inline)) void inner_product_int8_batch_sse( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int8_batch_sse_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int8_batch_sse_impl<1>(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::sse::internal + +#endif // defined(__SSE__) diff --git a/src/turbo/sse/record_quantized_int8/cosine.cc b/src/turbo/sse/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..879cf9c99 --- /dev/null +++ 
b/src/turbo/sse/record_quantized_int8/cosine.cc @@ -0,0 +1,70 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int8/cosine.h" +#include "sse/record_quantized_int8/common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE__ +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/cosine.h 
b/src/turbo/sse/record_quantized_int8/cosine.h new file mode 100644 index 000000000..e0ac7f556 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/cosine.h @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT8 vector pair. +// `dim` includes the original vector bytes plus a 24-byte metadata tail +// (3 floats: scale_a, bias_a, sum_a). +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +// The query must have been preprocessed by cosine_int8_query_preprocess +// (int8 -> uint8 via +128 shift) before calling this function. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.cc b/src/turbo/sse/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..6b6c4d9c1 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/inner_product.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int8/inner_product.h" +#include "sse/record_quantized_int8/common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +// Compute squared Euclidean distance between a single quantized INT4 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + const size_t original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__SSE__ +} + +// Batch version of inner_product_int8_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.h b/src/turbo/sse/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..9c6314b35 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..3fb001204 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "sse/record_quantized_int8/squared_euclidean.h" +#include "sse/record_quantized_int8/common.h" +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif +} + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.h b/src/turbo/sse/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..1e2cf45b4 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute squared Euclidean distance between a single quantized INT8 +// vector pair. +// `dim` includes the original vector bytes plus a 20-byte metadata tail +// (4 floats: scale_a, bias_a, sum_a, sum2_a). +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared_euclidean_int8_distance. +// The query must have been preprocessed by +// squared_euclidean_int8_query_preprocess (int8 -> uint8 via +128 shift) +// before calling this function. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +// Preprocess the query vector in-place (shift int8 -> uint8 by adding 128) +// for the batch path. Only the original_dim bytes are shifted; the metadata +// tail is left intact. `dim` includes the 20-byte metadata tail. 
+void squared_euclidean_int8_query_preprocess(void *query, size_t dim); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index a731cfed1..1fb5dcd7e 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -14,34 +14,344 @@ #include #include +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/squared_euclidean.h" +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/squared_euclidean.h" +#include "avx/float32/cosine.h" +#include "avx/float32/inner_product.h" +#include "avx/float32/squared_euclidean.h" +#include "avx/half_float/cosine.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/squared_euclidean.h" +#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/inner_product.h" +#include "avx2/record_quantized_int4/squared_euclidean.h" +#include "avx2/record_quantized_int8/cosine.h" +#include "avx2/record_quantized_int8/inner_product.h" +#include "avx2/record_quantized_int8/squared_euclidean.h" +#include "avx512/float32/cosine.h" +#include "avx512/float32/inner_product.h" +#include "avx512/float32/squared_euclidean.h" +#include "avx512/half_float/cosine.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/squared_euclidean.h" +#include "avx512_fp16/half_float/cosine.h" +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" +#include "avx512_vnni/record_quantized_int8/inner_product.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "scalar/float32/cosine.h" +#include "scalar/float32/inner_product.h" +#include "scalar/float32/squared_euclidean.h" +#include "scalar/half_float/cosine.h" +#include "scalar/half_float/inner_product.h" +#include "scalar/half_float/squared_euclidean.h" +#include 
"scalar/record_quantized_int4/cosine.h" +#include "scalar/record_quantized_int4/inner_product.h" +#include "scalar/record_quantized_int4/squared_euclidean.h" +#include "scalar/record_quantized_int8/cosine.h" +#include "scalar/record_quantized_int8/inner_product.h" +#include "scalar/record_quantized_int8/squared_euclidean.h" +#include "sse/record_quantized_int4/cosine.h" +#include "sse/record_quantized_int4/inner_product.h" +#include "sse/record_quantized_int4/squared_euclidean.h" +#include "sse/record_quantized_int8/cosine.h" +#include "sse/record_quantized_int8/inner_product.h" +#include "sse/record_quantized_int8/squared_euclidean.h" namespace zvec::turbo { DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { +#if defined(__ARM_NEON) + // INT8 if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int8_distance; + } + + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int8_distance; + } + } + } + + // INT$ + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } + + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } + } + } + + // FP32 + if (data_type == DataType::kFp32) { + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return armv8::squared_euclidean_fp32_distance; + } + + if (metric_type == MetricType::kCosine) { + return 
armv8::cosine_fp32_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return armv8::inner_product_fp32_distance; + } + } + } + + // FP16 + if (data_type == DataType::kFp16) { + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return armv8::squared_euclidean_fp16_distance; + } + + if (metric_type == MetricType::kCosine) { + return armv8::cosine_fp16_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return armv8::inner_product_fp16_distance; + } + } + } +#else + // INT8 + if (data_type == DataType::kInt8) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_distance; } if (metric_type == MetricType::kCosine) { return avx512_vnni::cosine_int8_distance; } + + if (metric_type == MetricType::kInnerProduct) { + return avx512_vnni::inner_product_int8_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int8_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return sse::inner_product_int8_distance; + } + 
} + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int8_distance; } } } + + // INT4 + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return sse::inner_product_int4_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } else if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } else if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } + } + } + + // FP32 + if (data_type == DataType::kFp32) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { 
+ return avx512::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx512::inner_product_fp32_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return avx::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx::inner_product_fp32_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_fp32_distance; + } + } + } + + // FP16 + if (data_type == DataType::kFp16) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_FP16 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512FP16)) { + if (metric_type == MetricType::kInnerProduct) { + return avx512_fp16::inner_product_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx512_fp16::cosine_fp16_distance; + } + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512_fp16::squared_euclidean_fp16_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512::squared_euclidean_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx512::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx512::inner_product_fp16_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX && + 
(cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx::squared_euclidean_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx::inner_product_fp16_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_fp16_distance; + } + } + } +#endif + return nullptr; } BatchDistanceFunc get_batch_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_batch_distance; } @@ -51,15 +361,37 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type, } } } + + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int4_batch_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_batch_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_batch_distance; + } + } + } + } + return nullptr; } 
QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_query_preprocess; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7be2294dd..e3b54ee24 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,6 +7,7 @@ include_directories(${PROJECT_ROOT_DIR}) cc_directories(ailego) cc_directories(db) cc_directories(core) +cc_directories(turbo) if(BUILD_C_BINDINGS) cc_directories(c) endif() diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_test.cc index 3f27f5252..1ee7ef6d1 100644 --- a/tests/core/algorithm/hnsw/hnsw_streamer_test.cc +++ b/tests/core/algorithm/hnsw/hnsw_streamer_test.cc @@ -3471,93 +3471,6 @@ TEST_F(HnswStreamerTest, TestGroupInBruteforceSearch) { } } -#if 0 -TEST_F(HnswStreamerTest, TestBinaryConverter) { - uint32_t dimension = 2560; - - IndexStreamer::Pointer streamer = - IndexFactory::CreateStreamer("HnswStreamer"); - ASSERT_TRUE(streamer != nullptr); - - ailego::Params params; - // params.set(PARAM_HNSW_STREAMER_MAX_NEIGHBOR_COUNT, 10); - // params.set(PARAM_HNSW_STREAMER_SCALING_FACTOR, 16); - // params.set(PARAM_HNSW_STREAMER_EFCONSTRUCTION, 10); - // params.set(PARAM_HNSW_STREAMER_EF, 5); - params.set(PARAM_HNSW_STREAMER_BRUTE_FORCE_THRESHOLD, 1000U); - - ailego::Params stg_params; - - IndexMeta index_meta_raw(IndexMeta::DataType::DT_FP32, dimension); - index_meta_raw.set_metric("InnerProduct", 0, ailego::Params()); - - ailego::Params 
converter_params; - auto converter = IndexFactory::CreateConverter("BinaryConverter"); - ASSERT_TRUE(converter != nullptr); - - converter->init(index_meta_raw, converter_params); - - IndexMeta index_meta = converter->meta(); - - auto reformer = IndexFactory::CreateReformer(index_meta.reformer_name()); - ASSERT_TRUE(reformer != nullptr); - - ASSERT_EQ(0, reformer->init(index_meta.reformer_params())); - - auto storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, storage->init(stg_params)); - ASSERT_EQ(0, storage->open(dir_ + "TestBinaryConverter.index", true)); - ASSERT_EQ(0, streamer->init(index_meta, params)); - ASSERT_EQ(0, streamer->open(storage)); - - size_t cnt = 5000U; - auto ctx = streamer->create_context(); - ASSERT_TRUE(!!ctx); - - IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dimension); - - std::random_device rd; - std::mt19937 gen(rd()); - - std::uniform_real_distribution dist(-2.0, 2.0); - std::vector> vecs; - - for (size_t i = 0; i < cnt; i++) { - NumericalVector vec(dimension); - for (size_t j = 0; j < dimension; ++j) { - vec[j] = dist(gen); - } - - std::string new_vec; - IndexQueryMeta new_meta; - - ASSERT_EQ(0, reformer->convert(vec.data(), qmeta, &new_vec, &new_meta)); - ASSERT_EQ(0, streamer->add_impl(i, new_vec.data(), new_meta, ctx)); - - vecs.push_back(vec); - } - - size_t query_cnt = 200U; - auto knnCtx = streamer->create_context(); - - float epison = 1e-6; - for (size_t i = 0; i < query_cnt; i++) { - auto &vec = vecs[i]; - std::string new_query; - IndexQueryMeta new_meta; - ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &new_query, &new_meta)); - - size_t topk = 50; - knnCtx->set_topk(topk); - ASSERT_EQ(0, streamer->search_impl(new_query.data(), new_meta, knnCtx)); - auto &results = knnCtx->result(); - ASSERT_EQ(topk, results.size()); - ASSERT_EQ(i, results[0].key()); - ASSERT_NEAR(0, results[0].score(), epison); - } -} -#endif - TEST_F(HnswStreamerTest, TestAddAndSearchWithID) { IndexStreamer::Pointer streamer = 
IndexFactory::CreateStreamer("HnswStreamer"); @@ -3671,131 +3584,134 @@ TEST_F(HnswStreamerTest, TestAddAndSearchWithID) { // EXPECT_GT(cost, 2.0f); } -#if 0 -TEST_F(HnswStreamerTest, TestBasicRefiner) { - uint32_t dimension = 1120; - - IndexStreamer::Pointer base_streamer = +TEST_F(HnswStreamerTest, TestTurboCosineInt8Quantizer) { + IndexStreamer::Pointer streamer = IndexFactory::CreateStreamer("HnswStreamer"); - ASSERT_TRUE(base_streamer != nullptr); + ASSERT_TRUE(streamer != nullptr); - IndexStreamer::Pointer refine_streamer = - IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(refine_streamer != nullptr); + ailego::Params params; + params.set(PARAM_HNSW_STREAMER_MAX_NEIGHBOR_COUNT, 50); + params.set(PARAM_HNSW_STREAMER_SCALING_FACTOR, 16); + params.set(PARAM_HNSW_STREAMER_EFCONSTRUCTION, 100); + params.set(PARAM_HNSW_STREAMER_EF, 100); + params.set(PARAM_HNSW_STREAMER_BRUTE_FORCE_THRESHOLD, 1000U); + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); - IndexRefiner::Pointer refiner = IndexFactory::CreateRefiner("BasicRefiner"); - ASSERT_TRUE(refiner != nullptr); + ailego::Params stg_params; - ailego::Params params; - IndexMeta index_meta(IndexMeta::DataType::DT_FP32, dimension); - index_meta.set_metric("InnerProduct", 0, ailego::Params()); + IndexMeta index_meta_raw(IndexMeta::DataType::DT_FP32, dim); + index_meta_raw.set_metric("Cosine", 0, ailego::Params()); ailego::Params converter_params; - auto converter = IndexFactory::CreateConverter("BinaryConverter"); - ASSERT_TRUE(converter != nullptr); + auto quantizer = IndexFactory::CreateQuantier("Int8Quantizer"); + ASSERT_TRUE(quantizer != nullptr); - converter->init(index_meta, converter_params); + quantizer->init(index_meta_raw, quantizer_params); - IndexMeta index_meta_binary = converter->meta(); + IndexMeta index_meta = quantizer->meta(); - auto reformer = - IndexFactory::CreateReformer(index_meta_binary.reformer_name()); - ASSERT_TRUE(reformer != nullptr); + auto storage = 
IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, + storage->open(dir_ + "TestTurboCosineInt8Quantizer.index", true)); + ASSERT_EQ(0, streamer->init(index_meta, params)); + ASSERT_EQ(0, streamer->open(storage)); - ASSERT_EQ(0, reformer->init(index_meta_binary.reformer_params())); + NumericalVector vec(dim); + size_t cnt = 2000U; + auto ctx = streamer->create_context(); + ASSERT_TRUE(!!ctx); - // base streamer - ailego::Params base_stg_params; - auto base_storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, base_storage->init(base_stg_params)); - ASSERT_EQ(0, base_storage->open(dir_ + "TestBasicRefinerBase.index", true)); - ASSERT_EQ(0, base_streamer->init(index_meta_binary, params)); - ASSERT_EQ(0, base_streamer->open(base_storage)); + IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dim); + IndexQueryMeta new_meta; - auto base_ctx = base_streamer->create_context(); - ASSERT_TRUE(!!base_ctx); + const float epsilon = 1e-2; + float fixed_value = float(cnt) / 2; + for (size_t i = 0; i < cnt; i++) { + float add_on = i * 10; + for (size_t j = 0; j < dim; ++j) { + if (j < dim / 4) + vec[j] = fixed_value; + else + vec[j] = fixed_value + add_on; + } - // refine streamer - ailego::Params refine_stg_params; - auto refine_storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, refine_storage->init(refine_stg_params)); - ASSERT_EQ(0, - refine_storage->open(dir_ + "TestBasicRefinerRefine.index", true)); - ASSERT_EQ(0, refine_streamer->init(index_meta, params)); - ASSERT_EQ(0, refine_streamer->open(refine_storage)); - auto refine_ctx = refine_streamer->create_context(); - ASSERT_TRUE(!!refine_ctx); + std::string new_vec; - ailego::Params refiner_params; - ASSERT_EQ(0, refiner->init(base_streamer, refine_streamer, refiner_params)); + ASSERT_EQ(0, quantizer->convert(vec.data(), qmeta, &new_vec, &new_meta)); + ASSERT_EQ(0, streamer->add_impl(i, new_vec.data(), new_meta, ctx)); + } - auto 
ctx = refiner->create_context(); - ASSERT_TRUE(!!ctx); + for (size_t i = 0; i < cnt; i++) { + float add_on = i * 10; - IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dimension); + const void *vector = streamer->get_vector(i); + ASSERT_NE(vector, nullptr); - std::random_device rd; - std::mt19937 gen(rd()); + std::string denormalized_vec; + denormalized_vec.resize(dim * sizeof(float)); + quantizer->revert(vector, new_meta, &denormalized_vec); - std::uniform_real_distribution dist(-2.0, 2.0); - std::vector> vecs; + float vector_value = *((float *)(denormalized_vec.data()) + dim - 1); + EXPECT_NEAR(vector_value, fixed_value + add_on, epsilon); + } - size_t cnt = 5000U; - for (size_t i = 0; i < cnt; i++) { - NumericalVector vec(dimension); - for (size_t j = 0; j < dimension; ++j) { - vec[j] = dist(gen); + auto linearCtx = streamer->create_context(); + linearCtx->set_fetch_vector(true); + auto knnCtx = streamer->create_context(); + knnCtx->set_fetch_vector(true); + + size_t query_cnt = 200U; + size_t topk = 200; + linearCtx->set_topk(topk); + knnCtx->set_topk(topk); + uint64_t knnTotalTime = 0; + uint64_t linearTotalTime = 0; + for (size_t i = 0; i < query_cnt; i++) { + float add_on = i * 10; + for (size_t j = 0; j < dim; ++j) { + if (j < dim / 4) + vec[j] = fixed_value; + else + vec[j] = fixed_value + add_on; } - std::string binary_vec; - IndexQueryMeta binary_qmeta; + std::string new_query; + IndexQueryMeta new_meta; + ASSERT_EQ(0, quantizer->quantize(vec.data(), qmeta, &new_query, &new_meta)); + auto t1 = ailego::Realtime::MicroSeconds(); + ASSERT_EQ(0, streamer->search_impl(new_query.data(), new_meta, knnCtx)); + auto t2 = ailego::Realtime::MicroSeconds(); ASSERT_EQ(0, - reformer->convert(vec.data(), qmeta, &binary_vec, &binary_qmeta)); - ASSERT_EQ(0, refiner->add_impl(i, binary_vec.data(), binary_qmeta, - vec.data(), qmeta, ctx)); - - vecs.push_back(vec); - } + streamer->search_bf_impl(new_query.data(), new_meta, linearCtx)); + auto t3 = 
ailego::Realtime::MicroSeconds(); - size_t query_cnt = 200U; - // size_t query_cnt = 1U; + knnTotalTime += t2 - t1; + linearTotalTime += t3 - t2; - auto searcherCtx = refiner->create_context(); + auto &knnResult = knnCtx->result(); + ASSERT_EQ(topk, knnResult.size()); - for (size_t i = 0; i < query_cnt; i++) { - auto &vec = vecs[i]; + auto &linearResult = linearCtx->result(); + ASSERT_EQ(topk, linearResult.size()); + ASSERT_EQ(i, linearResult[0].key()); - // float abs_value{0}; - // for (size_t j = 0; j < dimension; ++j) { - // std::cout << "dim: " << j << ", value: " << vec[j] << std::endl; + ASSERT_NE(knnResult[0].vector(), nullptr); + ASSERT_NE(linearResult[0].vector(), nullptr); - // abs_value += std::abs(vec[j]); - // } - // std::cout << "abs value: " << abs_value << std::endl; + std::string denormalized_vec; + denormalized_vec.resize(dim * sizeof(float)); + quantizer->dequantize(linearResult[0].vector(), new_meta, + &denormalized_vec); - std::string new_query; - IndexQueryMeta binary_qmeta; - ASSERT_EQ( - 0, reformer->transform(vec.data(), qmeta, &new_query, &binary_qmeta)); - - size_t topk = 50; - searcherCtx->set_topk(topk); - ASSERT_EQ(0, refiner->search_impl(new_query.data(), binary_qmeta, - vec.data(), qmeta, searcherCtx)); - auto &results = searcherCtx->result(); - ASSERT_EQ(topk, results.size()); - ASSERT_EQ(i, results[0].key()); - - // for (size_t i = 0; i < results.size(); ++i) { - // std::cout << i << ", id: " << results[i].index() - // << ", score: " << results[i].score() << std::endl; - // } + float vector_value = *(((float *)(denormalized_vec.data()) + dim - 1)); + EXPECT_NEAR(vector_value, fixed_value + add_on, epsilon); } -} - -#endif + std::cout << "knnTotalTime: " << knnTotalTime << std::endl; + std::cout << "linearTotalTime: " << linearTotalTime << std::endl; +} } // namespace core } // namespace zvec diff --git a/tests/core/metric/quantized_integer_metric_test.cc b/tests/core/metric/quantized_integer_metric_test.cc index 
835a07fb7..f56d6ef67 100644 --- a/tests/core/metric/quantized_integer_metric_test.cc +++ b/tests/core/metric/quantized_integer_metric_test.cc @@ -32,8 +32,7 @@ using namespace zvec::ailego; static IndexHolder::Pointer GetHolder( size_t dim, size_t count, std::uniform_real_distribution &dist) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); auto holder = std::make_shared>(dim); for (size_t i = 0; i < count; ++i) { ailego::NumericalVector vec(dim); @@ -71,8 +70,7 @@ TEST(QuantizedIntegerMetric, General) { Params params; - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 1.0); const size_t DIMENSION = 21; ailego::NumericalVector x(DIMENSION); @@ -141,8 +139,7 @@ TEST(QuantizedIntegerMetric, General) { } TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -202,8 +199,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { } TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanReformer) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); std::uniform_int_distribution dist2(0, 1); @@ -344,8 +340,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -404,8 +399,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { } TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanReformer) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); 
std::uniform_int_distribution dist2(0, 1); @@ -546,8 +540,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -631,8 +624,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProductMetric) { } TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -716,8 +708,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProductMetric) { } TEST(QuantizedIntegerMetric, TestInt8MipsSquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -805,8 +796,7 @@ TEST(QuantizedIntegerMetric, TestInt8MipsSquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt4MipsSquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -890,8 +880,7 @@ TEST(QuantizedIntegerMetric, TestInt4MipsSquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt8NormalizedCosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -990,8 +979,7 @@ TEST(QuantizedIntegerMetric, TestInt8NormalizedCosineMetric) { } TEST(QuantizedIntegerMetric, TestInt8Cosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const 
size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -1071,8 +1059,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { } TEST(QuantizedIntegerMetric, TestInt4NormalizedCosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; diff --git a/tests/turbo/CMakeLists.txt b/tests/turbo/CMakeLists.txt new file mode 100644 index 000000000..0e864858a --- /dev/null +++ b/tests/turbo/CMakeLists.txt @@ -0,0 +1,14 @@ +include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) + +file(GLOB_RECURSE ALL_TEST_SRCS *_test.cc) + +foreach(CC_SRCS ${ALL_TEST_SRCS}) + get_filename_component(CC_TARGET ${CC_SRCS} NAME_WE) + cc_gtest( + NAME ${CC_TARGET} + STRICT + LIBS zvec_ailego core_framework core_metric core_quantizer + SRCS ${CC_SRCS} + INCS . ${PROJECT_ROOT_DIR}/src/core/ + ) +endforeach() \ No newline at end of file diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc new file mode 100644 index 000000000..ece33613d --- /dev/null +++ b/tests/turbo/turbo_cosine_test.cc @@ -0,0 +1,366 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx, avx512, scalar +TEST(CosineMetric, TestFp32Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + 
ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(CosineMetric, TestFp16Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, 
DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.2; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, scalar +TEST(CosineMetric, TestFp32CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + 
turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 
0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(CosineMetric, TestFp16CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + 
std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc new file mode 100644 index 000000000..8388489f4 --- /dev/null +++ b/tests/turbo/turbo_euclidean_test.cc @@ -0,0 +1,316 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx, avx512, scalar +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + 
float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector 
doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.2; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, scalar +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + std::vector> doc_vecs; + for 
(size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + doc_vecs.push_back(doc_vec); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, 
turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + 
qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc new file mode 100644 index 000000000..14fc2cfc0 --- /dev/null +++ b/tests/turbo/turbo_inner_product_test.cc @@ -0,0 +1,317 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx, avx512, scalar +TEST(InnerProductMetric, TestFp32InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(InnerProductMetric, TestFp16InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = 
IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), 
&score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.2; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, scalar +TEST(InnerProductMetric, TestFp32InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + std::vector> doc_vecs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + 
batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(InnerProductMetric, TestFp16InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for 
(size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git 
a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc new file mode 100644 index 000000000..3394a27a0 --- /dev/null +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -0,0 +1,1300 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx512vnni = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, 
turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx512vnni{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx512vnni(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512vnni); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, 
score_avx512vnni, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, 
reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = 
turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 
0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), 
convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int8 converter + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + 
ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx512vnni = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float 
score_avx512vnni{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + func_float32(fp32_query_out.data(), fp32_doc_out.data(), + fp32_qmeta_reformer.dimension(), &score_float32); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx512vnni(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512vnni); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx512vnni, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + 
IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int4 converter + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float 
score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + func_float32(fp32_query_out.data(), fp32_doc_out.data(), + fp32_qmeta_reformer.dimension(), &score_float32); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + 
turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx512vnni = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx512vnni(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 
0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx512vnni(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx512vnni[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx512vnni[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = 
IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions 
+ std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, 
turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), 
BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( 
+ turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); 
+ + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int8 converter + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx512vnni = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + 
turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + std::vector fp32_doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + fp32_doc_outs.push_back(fp32_doc_out); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + 
doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector score_float32(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx512vnni(BATCH_SIZE, 0.0f); + std::vector score_avx2(BATCH_SIZE, 0.0f); + std::vector score_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector fp32_doc_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + fp32_doc_ptrs[k] = fp32_doc_outs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(fp32_doc_ptrs.data(), fp32_query_out.data(), + BATCH_SIZE, fp32_qmeta_reformer.dimension(), + &score_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_scalar[0]); + + batch_func_avx512vnni(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx512vnni[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(score_float32[j], score_avx512vnni[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar[j], score_avx2[j], 0.001); + ASSERT_NEAR(score_scalar[j], score_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + fp32_doc_outs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + IndexMeta 
meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int4 converter + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + 
+ IndexQueryMeta qmeta_reformer; + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + std::vector fp32_doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + fp32_doc_outs.push_back(fp32_doc_out); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector score_float32(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx2(BATCH_SIZE, 0.0f); + std::vector score_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector fp32_doc_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + fp32_doc_ptrs[k] = fp32_doc_outs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(fp32_doc_ptrs.data(), fp32_query_out.data(), + BATCH_SIZE, fp32_qmeta_reformer.dimension(), + &score_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(score_float32[j], 
score_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar[j], score_avx2[j], 0.001); + ASSERT_NEAR(score_scalar[j], score_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + fp32_doc_outs.clear(); + } + } +} \ No newline at end of file