From 59eea4c5c14099ed0b8e793034b82fdf5bf7a12d Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 24 Mar 2026 11:36:47 +0800 Subject: [PATCH 01/44] refactor: add extra meta size --- src/core/framework/index_meta.cc | 4 +++- src/include/zvec/core/framework/index_meta.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/core/framework/index_meta.cc b/src/core/framework/index_meta.cc index 11d54cb63..d0eadb02d 100644 --- a/src/core/framework/index_meta.cc +++ b/src/core/framework/index_meta.cc @@ -30,7 +30,8 @@ struct IndexMetaFormatHeader { uint32_t space_id; uint32_t attachment_offset; uint32_t attachment_size; - uint8_t reserved_[4092]; + uint32_t extra_meta_size; + uint8_t reserved_[4088]; }; static_assert(sizeof(IndexMetaFormatHeader) % 32 == 0, @@ -47,6 +48,7 @@ void IndexMeta::serialize(std::string *out) const { format.dimension = dimension_; format.unit_size = unit_size_; format.space_id = space_id_; + format.extra_meta_size = extra_meta_size_; if (!metric_name_.empty()) { ailego::Params item; diff --git a/src/include/zvec/core/framework/index_meta.h b/src/include/zvec/core/framework/index_meta.h index 3a09aaefb..225b9d0da 100644 --- a/src/include/zvec/core/framework/index_meta.h +++ b/src/include/zvec/core/framework/index_meta.h @@ -38,6 +38,16 @@ class IndexMeta { DT_INT4 = 6, DT_BINARY32 = 7, DT_BINARY64 = 8, + + // new data type for turboss + DT_ZVEC_FP16_ = 11, + DT_ZVEC_FP32 = 12, + DT_ZVEC_FP64 = 13, + DT_ZVEC_INT8 = 14, + DT_ZVEC_INT16 = 15, + DT_ZVEC_INT4 = 16, + DT_ZVEC_BINARY32 = 7, + DT_ZVEC_BINARY64 = 8, }; /*! 
Major Orders @@ -586,6 +596,7 @@ class IndexMeta { uint32_t dimension_{0}; uint32_t unit_size_{0}; uint32_t element_size_{0}; + uint32_t extra_meta_size_{0}; uint64_t space_id_{0}; uint32_t metric_revision_{0}; uint32_t converter_revision_{0}; From 517ce507e8c1dbea4c6b511a396e0375cadf2342 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 24 Mar 2026 19:59:58 +0800 Subject: [PATCH 02/44] feat: turbo distances --- src/core/metric/quantized_integer_metric.cc | 7 + src/include/zvec/core/framework/index_meta.h | 16 +- src/include/zvec/turbo/turbo.h | 2 + src/turbo/CMakeLists.txt | 33 ++ src/turbo/avx2/half_float_converter/common.h | 34 ++ src/turbo/avx2/record_quantized_int4/common.h | 267 +++++++++++++++ .../avx2/record_quantized_int4/cosine.cc | 106 ++++++ src/turbo/avx2/record_quantized_int4/cosine.h | 30 ++ .../record_quantized_int4/inner_product.cc | 114 +++++++ .../record_quantized_int4/inner_product.h | 31 ++ .../squared_euclidean.cc | 49 +++ .../record_quantized_int4/squared_euclidean.h | 31 ++ src/turbo/avx512/float32/common.h | 34 ++ .../avx512/half_float_converter/common.h | 312 ++++++++++++++++++ .../avx512fp16/half_float_converter/common.h | 312 ++++++++++++++++++ src/turbo/sse/record_quantized_int4/common.h | 43 +++ src/turbo/sse/record_quantized_int4/cosine.cc | 53 +++ src/turbo/sse/record_quantized_int4/cosine.h | 34 ++ .../record_quantized_int4/inner_product.cc | 116 +++++++ .../sse/record_quantized_int4/inner_product.h | 32 ++ .../squared_euclidean.cc | 13 + .../record_quantized_int4/squared_euclidean.h | 15 + src/turbo/sse/record_quantized_int8/common.h | 33 ++ src/turbo/sse/record_quantized_int8/cosine.cc | 13 + src/turbo/sse/record_quantized_int8/cosine.h | 39 +++ .../record_quantized_int8/inner_product.cc | 13 + .../sse/record_quantized_int8/inner_product.h | 15 + .../squared_euclidean.cc | 134 ++++++++ .../record_quantized_int8/squared_euclidean.h | 41 +++ src/turbo/turbo.cc | 35 ++ 30 files changed, 1999 insertions(+), 8 deletions(-) create mode 
100644 src/turbo/avx2/half_float_converter/common.h create mode 100644 src/turbo/avx2/record_quantized_int4/common.h create mode 100644 src/turbo/avx2/record_quantized_int4/cosine.cc create mode 100644 src/turbo/avx2/record_quantized_int4/cosine.h create mode 100644 src/turbo/avx2/record_quantized_int4/inner_product.cc create mode 100644 src/turbo/avx2/record_quantized_int4/inner_product.h create mode 100644 src/turbo/avx2/record_quantized_int4/squared_euclidean.cc create mode 100644 src/turbo/avx2/record_quantized_int4/squared_euclidean.h create mode 100644 src/turbo/avx512/float32/common.h create mode 100644 src/turbo/avx512/half_float_converter/common.h create mode 100644 src/turbo/avx512fp16/half_float_converter/common.h create mode 100644 src/turbo/sse/record_quantized_int4/common.h create mode 100644 src/turbo/sse/record_quantized_int4/cosine.cc create mode 100644 src/turbo/sse/record_quantized_int4/cosine.h create mode 100644 src/turbo/sse/record_quantized_int4/inner_product.cc create mode 100644 src/turbo/sse/record_quantized_int4/inner_product.h create mode 100644 src/turbo/sse/record_quantized_int4/squared_euclidean.cc create mode 100644 src/turbo/sse/record_quantized_int4/squared_euclidean.h create mode 100644 src/turbo/sse/record_quantized_int8/common.h create mode 100644 src/turbo/sse/record_quantized_int8/cosine.cc create mode 100644 src/turbo/sse/record_quantized_int8/cosine.h create mode 100644 src/turbo/sse/record_quantized_int8/inner_product.cc create mode 100644 src/turbo/sse/record_quantized_int8/inner_product.h create mode 100644 src/turbo/sse/record_quantized_int8/squared_euclidean.cc create mode 100644 src/turbo/sse/record_quantized_int8/squared_euclidean.h diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index e4db83146..8562a3c94 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -113,7 +113,14 @@ class QuantizedIntegerMetric : 
public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { return DistanceMatrixCompute(m, n); } + if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { + auto turbo_ret = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } break; diff --git a/src/include/zvec/core/framework/index_meta.h b/src/include/zvec/core/framework/index_meta.h index 225b9d0da..451e14059 100644 --- a/src/include/zvec/core/framework/index_meta.h +++ b/src/include/zvec/core/framework/index_meta.h @@ -40,14 +40,14 @@ class IndexMeta { DT_BINARY64 = 8, // new data type for turboss - DT_ZVEC_FP16_ = 11, - DT_ZVEC_FP32 = 12, - DT_ZVEC_FP64 = 13, - DT_ZVEC_INT8 = 14, - DT_ZVEC_INT16 = 15, - DT_ZVEC_INT4 = 16, - DT_ZVEC_BINARY32 = 7, - DT_ZVEC_BINARY64 = 8, + // DT_ZVEC_FP16_ = 11, + // DT_ZVEC_FP32 = 12, + // DT_ZVEC_FP64 = 13, + // DT_ZVEC_INT8 = 14, + // DT_ZVEC_INT16 = 15, + // DT_ZVEC_INT4 = 16, + // DT_ZVEC_BINARY32 = 7, + // DT_ZVEC_BINARY64 = 8, }; /*! 
Major Orders diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h index 6ecbfdd1e..f6054c7a8 100644 --- a/src/include/zvec/turbo/turbo.h +++ b/src/include/zvec/turbo/turbo.h @@ -28,11 +28,13 @@ using QueryPreprocessFunc = enum class MetricType { kSquaredEuclidean, kCosine, + kInnerProduct, kMipsSquaredEuclidean, kUnknown, }; enum class DataType { + kInt4, kInt8, kUnknown, }; diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 3e2d0134f..6f7416c70 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -28,6 +28,39 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) endif() endif() +if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") + file(GLOB_RECURSE AVX512_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.cc) + set_source_files_properties( + ${AVX512_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" + ) + endif() +endif() + +if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") + file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc) + set_source_files_properties( + ${AVX2_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX2}" + ) + endif() +endif() + +if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") + file(GLOB_RECURSE SSE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc) + set_source_files_properties( + ${SSE_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_SSE}" + ) + endif() +endif() + cc_library( NAME zvec_turbo STATIC STRICT PACKED SRCS ${ALL_SRCS} diff --git a/src/turbo/avx2/half_float_converter/common.h b/src/turbo/avx2/half_float_converter/common.h new file mode 100644 index 000000000..4f11cc2a9 --- /dev/null +++ b/src/turbo/avx2/half_float_converter/common.h @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX2 helper kernels for half_float_converter +// implementations (currently an empty placeholder). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int4/common.h b/src/turbo/avx2/record_quantized_int4/common.h new file mode 100644 index 000000000..bd223e108 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/common.h @@ -0,0 +1,267 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX2 inner product kernels for record_quantized_int4 distance +// implementations (cosine, l2, mips_l2, etc.).
+// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void ip_int4_avx2(const void *a, + const void *b, + size_t size, + float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 5) << 5); + __m256i ymm_sum = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m256i ymm_lhs = _mm256_load_si256((const __m256i *)(lhs)); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)(rhs)); + FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + __m128i xmm_sum = _mm_setzero_si128(); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + ymm_sum = _mm256_add_epi32(_mm256_set_m128i(_mm_setzero_si128(), xmm_sum), + ymm_sum); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)(lhs)); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)(rhs)); + FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + __m128i xmm_sum = _mm_setzero_si128(); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + ymm_sum = _mm256_add_epi32(_mm256_set_m128i(_mm_setzero_si128(), xmm_sum), + ymm_sum); + lhs += 16; + rhs += 16; + } + } + float result = static_cast(HorizontalAdd_INT32_V256(ymm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + 
FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. 
+template +__attribute__((always_inline)) void ip_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void ip_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + ip_int4_batch_avx2_impl(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + ip_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..d40c8e7db --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -0,0 +1,106 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/common.h" +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::ip_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + + // Dequantize and compute cosine distance: + // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms + // + original_dim * qb * mb) + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + // `dim` is the full encoded size; the original vector occupies dim-24 bytes. 
+ const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::ip_int4_batch_avx2(vectors, query, n, original_dim, distances); + + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + + for (int i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + // Correct for the +128 shift applied to the query during preprocessing: + // dpbusd computes sum(uint8_query[i] * int8_data[i]) + // = sum((int8_query[i] + 128) * int8_data[i]) + // = true_ip + 128 * sum(int8_data[i]) + // int8_sum is stored as the 5th int-sized field after the 4 floats. + int int8_sum = reinterpret_cast(m_tail)[4]; + float &result = distances[i]; + result -= 128.0f * static_cast(int8_sum); + + // Dequantize and compute cosine distance: + // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms + // + original_dim * qb * mb) + result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/cosine.h b/src/turbo/avx2/record_quantized_int4/cosine.h new file mode 100644 index 000000000..77b4adad9 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..9dc36e6d6 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -0,0 +1,114 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int4/inner_product.h" +#include "avx2/record_quantized_int4/common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +// Compute squared Euclidean distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::ip_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX2__ +} + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX2__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + + internal::ip_int4_batch_avx2(vectors, query, n, original_dim, distances); + + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + + for (int i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + // Correct for the +128 shift applied to the query during preprocessing: + // dpbusd computes sum(uint8_query[i] * int8_data[i]) + // = sum((int8_query[i] + 128) * int8_data[i]) + // = true_ip + 128 * sum(int8_data[i]) + // int8_sum is stored as the 5th int-sized field after the 4 floats. + int int8_sum = reinterpret_cast(m_tail)[4]; + float &result = distances[i]; + result -= 128.0f * static_cast(int8_sum); + + // Dequantize and compute cosine distance: + // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms + // + original_dim * qb * mb) + result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.h b/src/turbo/avx2/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..0e9e69d63 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized INT4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. +void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..676e62aae --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int4/common.h" +#include "avx2/record_quantized_int4/squared_euclidean.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.h b/src/turbo/avx2/record_quantized_int4/squared_euclidean.h new file mode 100644 index 000000000..b6d15f698 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized INT4 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. 
+void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx512/float32/common.h b/src/turbo/avx512/float32/common.h new file mode 100644 index 000000000..35dbf1f08 --- /dev/null +++ b/src/turbo/avx512/float32/common.h @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. 
+ +#pragma once + +#if defined(__AVX512VNNI__) +#include +#include +#include + +namespace zvec::turbo::avx512_vnni::internal { + +} // namespace zvec::turbo::avx512_vnni::internal + +#endif // defined(__AVX512VNNI__) diff --git a/src/turbo/avx512/half_float_converter/common.h b/src/turbo/avx512/half_float_converter/common.h new file mode 100644 index 000000000..55fb5898c --- /dev/null +++ b/src/turbo/avx512/half_float_converter/common.h @@ -0,0 +1,312 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. 
+ +#pragma once + +#if defined(__AVX512VNNI__) +#include +#include +#include + +namespace zvec::turbo::avx512_vnni::internal { + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +// Compute the raw integer inner product of two int8 vectors of length `size`. +// The result is written to `*distance` as a float. +// Both `a` and `b` must point to int8_t arrays. +static __attribute__((always_inline)) void ip_int8_avx512_vnni( + const void *a, const void *b, size_t size, float *distance) { + const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); + const __m128i ONES_INT16_SSE = _mm_set1_epi32(0x00010001); + + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 6) << 6); + + float result = 0.0f; + + __m256i ymm_sum_0 = _mm256_setzero_si256(); + __m256i ymm_sum_1 = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + 
_mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_load_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + 
_mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } + result = static_cast( + HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + *distance = result; +} + +#undef FMA_INT8_GENERAL + +// Shift the first `original_dim` bytes of `query` in-place from int8 to uint8 +// by adding 128 to each element. The metadata tail beyond `original_dim` is +// left untouched. 
This prepares the query for use with dpbusd (uint8 * int8). +static __attribute__((always_inline)) void shift_int8_to_uint8_avx512( + void *query, size_t original_dim) { + const int8_t *input = reinterpret_cast(query); + uint8_t *output = reinterpret_cast(query); + + // 128 represented as int8_t wraps to -128, but two's complement addition + // produces the correct uint8 result. + const __m512i offset = _mm512_set1_epi8(static_cast(128)); + + size_t i = 0; + for (; i + 64 <= original_dim; i += 64) { + __m512i data = + _mm512_loadu_si512(reinterpret_cast(input + i)); + __m512i shifted = _mm512_add_epi8(data, offset); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output + i), shifted); + } + for (; i < original_dim; ++i) { + output[i] = static_cast(static_cast(input[i]) + 128); + } +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. 
+template +__attribute__((always_inline)) void ip_int8_batch_avx512_vnni_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + __m512i accs[batch_size]; + for (size_t i = 0; i < batch_size; ++i) { + accs[i] = _mm512_setzero_si512(); + } + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 64) { + __m512i q = _mm512_loadu_si512(reinterpret_cast( + reinterpret_cast(query) + dim)); + __m512i data_regs[batch_size]; + for (size_t i = 0; i < batch_size; ++i) { + data_regs[i] = _mm512_loadu_si512(reinterpret_cast( + reinterpret_cast(vectors[i]) + dim)); + } + for (size_t i = 0; i < batch_size; ++i) { + if (prefetch_ptrs[i]) { + _mm_prefetch( + reinterpret_cast( + reinterpret_cast(prefetch_ptrs[i]) + dim), + _MM_HINT_T0); + } + accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]); + } + } + std::array temp_results{}; + for (size_t i = 0; i < batch_size; ++i) { + temp_results[i] = _mm512_reduce_add_epi32(accs[i]); + } + for (; dim < dimensionality; ++dim) { + int q = static_cast(reinterpret_cast(query)[dim]); + for (size_t i = 0; i < batch_size; ++i) { + temp_results[i] += + q * + static_cast(reinterpret_cast(vectors[i])[dim]); + } + } + for (size_t i = 0; i < batch_size; ++i) { + distances[i] = static_cast(temp_results[i]); + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. 
+static __attribute__((always_inline)) void ip_int8_batch_avx512_vnni( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + ip_int8_batch_avx512_vnni_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + ip_int8_batch_avx512_vnni_impl<1>(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::avx512_vnni::internal + +#endif // defined(__AVX512VNNI__) diff --git a/src/turbo/avx512fp16/half_float_converter/common.h b/src/turbo/avx512fp16/half_float_converter/common.h new file mode 100644 index 000000000..55fb5898c --- /dev/null +++ b/src/turbo/avx512fp16/half_float_converter/common.h @@ -0,0 +1,312 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). 
+// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX512VNNI__) +#include +#include +#include + +namespace zvec::turbo::avx512_vnni::internal { + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +// Compute the raw integer inner product of two int8 vectors of length `size`. +// The result is written to `*distance` as a float. +// Both `a` and `b` must point to int8_t arrays. +static __attribute__((always_inline)) void ip_int8_avx512_vnni( + const void *a, const void *b, size_t size, float *distance) { + const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); + const __m128i ONES_INT16_SSE = _mm_set1_epi32(0x00010001); + + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 6) << 6); + + float result = 0.0f; + + __m256i ymm_sum_0 = _mm256_setzero_si256(); + __m256i ymm_sum_1 = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + 
ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_load_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) 
{ + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } + result = static_cast( + HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + *distance = result; +} + +#undef 
FMA_INT8_GENERAL + +// Shift the first `original_dim` bytes of `query` in-place from int8 to uint8 +// by adding 128 to each element. The metadata tail beyond `original_dim` is +// left untouched. This prepares the query for use with dpbusd (uint8 * int8). +static __attribute__((always_inline)) void shift_int8_to_uint8_avx512( + void *query, size_t original_dim) { + const int8_t *input = reinterpret_cast(query); + uint8_t *output = reinterpret_cast(query); + + // 128 represented as int8_t wraps to -128, but two's complement addition + // produces the correct uint8 result. + const __m512i offset = _mm512_set1_epi8(static_cast(128)); + + size_t i = 0; + for (; i + 64 <= original_dim; i += 64) { + __m512i data = + _mm512_loadu_si512(reinterpret_cast(input + i)); + __m512i shifted = _mm512_add_epi8(data, offset); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output + i), shifted); + } + for (; i < original_dim; ++i) { + output[i] = static_cast(static_cast(input[i]) + 128); + } +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. 
+template +__attribute__((always_inline)) void ip_int8_batch_avx512_vnni_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + __m512i accs[batch_size]; + for (size_t i = 0; i < batch_size; ++i) { + accs[i] = _mm512_setzero_si512(); + } + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 64) { + __m512i q = _mm512_loadu_si512(reinterpret_cast( + reinterpret_cast(query) + dim)); + __m512i data_regs[batch_size]; + for (size_t i = 0; i < batch_size; ++i) { + data_regs[i] = _mm512_loadu_si512(reinterpret_cast( + reinterpret_cast(vectors[i]) + dim)); + } + for (size_t i = 0; i < batch_size; ++i) { + if (prefetch_ptrs[i]) { + _mm_prefetch( + reinterpret_cast( + reinterpret_cast(prefetch_ptrs[i]) + dim), + _MM_HINT_T0); + } + accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]); + } + } + std::array temp_results{}; + for (size_t i = 0; i < batch_size; ++i) { + temp_results[i] = _mm512_reduce_add_epi32(accs[i]); + } + for (; dim < dimensionality; ++dim) { + int q = static_cast(reinterpret_cast(query)[dim]); + for (size_t i = 0; i < batch_size; ++i) { + temp_results[i] += + q * + static_cast(reinterpret_cast(vectors[i])[dim]); + } + } + for (size_t i = 0; i < batch_size; ++i) { + distances[i] = static_cast(temp_results[i]); + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. 
+static __attribute__((always_inline)) void ip_int8_batch_avx512_vnni( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + ip_int8_batch_avx512_vnni_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + ip_int8_batch_avx512_vnni_impl<1>(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::avx512_vnni::internal + +#endif // defined(__AVX512VNNI__) diff --git a/src/turbo/sse/record_quantized_int4/common.h b/src/turbo/sse/record_quantized_int4/common.h new file mode 100644 index 000000000..c47294eb6 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/common.h @@ -0,0 +1,43 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). 
+// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__SSE4_1__) +#include +#include +#include + +namespace zvec::turbo::sse::internal { + +static __attribute__((always_inline)) void ip_int4_sse(const void *a, + const void *b, + size_t size, + float *distance) {} + +static __attribute__((always_inline)) void ip_int4_batch_sse( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) {} + +} // namespace zvec::turbo::sse::internal + +#endif // defined(__SSE4_1__) diff --git a/src/turbo/sse/record_quantized_int4/cosine.cc b/src/turbo/sse/record_quantized_int4/cosine.cc new file mode 100644 index 000000000..f041bfe80 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/cosine.cc @@ -0,0 +1,53 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sse/record_quantized_int4/cosine.h" +#include "sse/record_quantized_int4/common.h" +#if defined(__SSE4_1__) +#include +#endif + +namespace zvec::turbo::sse { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE4_1__) + // `dim` is the full encoded size; the original vector occupies dim-24 bytes. 
+ const int original_dim = dim - 24;
+ if (original_dim <= 0) {
+ return;
+ }
+
+#else
+ (void)a;
+ (void)b;
+ (void)dim;
+ (void)distance;
+#endif // __SSE4_1__
+}
+
+void cosine_int4_batch_distance(const void *const *vectors, const void *query,
+ size_t n, size_t dim, float *distances) {
+#if defined(__SSE4_1__)
+
+#else
+ (void)vectors;
+ (void)query;
+ (void)n;
+ (void)dim;
+ (void)distances;
+#endif //__SSE4_1__
+}
+
+} // namespace zvec::turbo::sse
\ No newline at end of file
diff --git a/src/turbo/sse/record_quantized_int4/cosine.h b/src/turbo/sse/record_quantized_int4/cosine.h
new file mode 100644
index 000000000..bab173eca
--- /dev/null
+++ b/src/turbo/sse/record_quantized_int4/cosine.h
@@ -0,0 +1,34 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+
+namespace zvec::turbo::sse {
+
+// Compute cosine distance (negative inner product after normalization) between
+// a single quantized INT4 vector pair.
+// `dim` includes the original vector bytes plus a 24-byte metadata tail
+// (3 floats: scale_a, bias_a, sum_a — NOTE(review): 3 floats is 12 bytes; confirm tail layout).
+void cosine_int4_distance(const void *a, const void *b, size_t dim,
+ float *distance);
+
+// Batch version of cosine_int4_distance.
+// The query must have been preprocessed by cosine_int8_query_preprocess
+// (int8 -> uint8 via + 128 shift) before calling this function.
+void cosine_int4_batch_distance(const void *const *vectors, const void *query,
+ size_t n, size_t dim, float *distances);
+
+} // namespace zvec::turbo::sse
\ No newline at end of file
diff --git a/src/turbo/sse/record_quantized_int4/inner_product.cc b/src/turbo/sse/record_quantized_int4/inner_product.cc
new file mode 100644
index 000000000..e8ef5df7c
--- /dev/null
+++ b/src/turbo/sse/record_quantized_int4/inner_product.cc
@@ -0,0 +1,116 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "sse/record_quantized_int4/inner_product.h"
+#include "sse/record_quantized_int4/common.h"
+
+#if defined(__SSE4_1__)
+#include
+#endif
+
+namespace zvec::turbo::sse {
+
+// Compute inner product distance between a single quantized INT4
+// vector pair.
+void inner_product_int4_distance(const void *a, const void *b, size_t dim,
+ float *distance) {
+#if defined(__SSE4_1__)
+ // `dim` is the full encoded size; NOTE(review): the code below strips a 32-byte tail (dim-32), not dim-24 — confirm.
+ const int d = dim - 32;
+ const size_t original_dim = d >> 1;
+
+ if (original_dim <= 0) {
+ return;
+ }
+
+ internal::ip_int4_sse(a, b, original_dim, distance);
+
+ const float *a_tail = reinterpret_cast(
+ reinterpret_cast(a) + original_dim);
+ const float *b_tail = reinterpret_cast(
+ reinterpret_cast(b) + original_dim);
+
+ float qa = a_tail[0];
+ float qb = a_tail[1];
+ float qs = a_tail[2];
+ float qs2 = a_tail[3];
+ const float sum = qa * qs;
+ const float sum2 = qa * qa * qs2;
+
+ float ma = b_tail[0];
+ float mb = b_tail[1];
+ float ms = b_tail[2];
+ float ms2 = b_tail[3];
+
+ *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance +
+ (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum);
+
+#else
+ (void)a;
+ (void)b;
+ (void)dim;
+ (void)distance;
+#endif
+}
+
+// Batch version of inner_product_int4_distance.
+void inner_product_int4_batch_distance(const void *const *vectors,
+ const void *query, size_t n, size_t dim,
+ float *distances) {
+#if defined(__SSE4_1__)
+ // `dim` is the full encoded size; the original vector occupies dim-24 bytes (NOTE(review): single-pair path strips 32 — confirm tail size).
+ const int original_dim = dim - 24;
+ if (original_dim <= 0) {
+ return;
+ }
+
+ internal::ip_int4_batch_sse(vectors, query, n, original_dim, distances);
+
+ const float *q_tail = reinterpret_cast(
+ reinterpret_cast(query) + original_dim);
+ float qa = q_tail[0];
+ float qb = q_tail[1];
+ float qs = q_tail[2];
+
+ for (int i = 0; i < n; ++i) {
+ const float *m_tail = reinterpret_cast(
+ reinterpret_cast(vectors[i]) + original_dim);
+ float ma = m_tail[0];
+ float mb = m_tail[1];
+ float ms = m_tail[2];
+ // Correct for the +128 shift applied to the query during preprocessing:
+ // dpbusd computes sum(uint8_query[i] * int8_data[i])
+ // = sum((int8_query[i] + 128) * int8_data[i])
+ // = true_ip + 128 * sum(int8_data[i])
+ // int8_sum is stored as the 5th int-sized field after the 4 floats (NOTE(review): only 3 floats are read above — confirm layout).
+ int int8_sum = reinterpret_cast(m_tail)[4];
+ float &result = distances[i];
+ result -= 128.0f * static_cast(int8_sum);
+
+ // Dequantize (NOTE(review): formula labelled cosine — appears copied from the cosine path; confirm for inner product):
+ // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms
+ // + original_dim * qb * mb)
+ result = -(ma * qa * result + mb * qa * qs + qb * ma * ms +
+ static_cast(original_dim) * qb * mb);
+ }
+#else
+ (void)vectors;
+ (void)query;
+ (void)n;
+ (void)dim;
+ (void)distances;
+#endif // __SSE4_1__
+}
+
+} // namespace zvec::turbo::sse
\ No newline at end of file
diff --git a/src/turbo/sse/record_quantized_int4/inner_product.h b/src/turbo/sse/record_quantized_int4/inner_product.h
new file mode 100644
index 000000000..8a6ee015c
--- /dev/null
+++ b/src/turbo/sse/record_quantized_int4/inner_product.h
@@ -0,0 +1,32 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+
+#include
+
+namespace zvec::turbo::sse {
+
+// Compute inner product distance between a single quantized INT4
+// vector pair.
+void inner_product_int4_distance(const void *a, const void *b, size_t dim,
+ float *distance);
+
+// Batch version of inner_product_int4_distance.
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..22447509b --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,13 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.h b/src/turbo/sse/record_quantized_int4/squared_euclidean.h new file mode 100644 index 000000000..a0b74ecbf --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.h @@ -0,0 +1,15 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
diff --git a/src/turbo/sse/record_quantized_int8/common.h b/src/turbo/sse/record_quantized_int8/common.h
new file mode 100644
index 000000000..cb9727491
--- /dev/null
+++ b/src/turbo/sse/record_quantized_int8/common.h
@@ -0,0 +1,33 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared SSE inner product kernels for record_quantized_int8 distance
+// implementations (cosine, l2, mips_l2, etc.).
+//
+// All functions are marked always_inline so that when this header is included
+// from a per-file-march .cc translation unit, the compiler can fully inline
+// and optimize them under the correct -march flag without any cross-TU call
+// overhead.
+
+#pragma once
+
+#if defined(__SSE__)
+#include <immintrin.h>
+
+namespace zvec::turbo::avx512_vnni::sse {
+
+
+}  // namespace zvec::turbo::avx512_vnni::sse
+
+#endif  // defined(__SSE__)
diff --git a/src/turbo/sse/record_quantized_int8/cosine.cc b/src/turbo/sse/record_quantized_int8/cosine.cc
new file mode 100644
index 000000000..22447509b
--- /dev/null
+++ b/src/turbo/sse/record_quantized_int8/cosine.cc
@@ -0,0 +1,13 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/src/turbo/sse/record_quantized_int8/cosine.h b/src/turbo/sse/record_quantized_int8/cosine.h new file mode 100644 index 000000000..5fb491eab --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/cosine.h @@ -0,0 +1,39 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized INT8 vector pair. +// `dim` includes the original vector bytes plus a 24-byte metadata tail +// (3 floats: scale_a, bias_a, sum_a). +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +// The query must have been preprocessed by cosine_int8_query_preprocess +// (int8 -> uint8 via +128 shift) before calling this function. 
+void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +// Preprocess the query vector in-place (shift int8 -> uint8 by adding 128) +// so that the AVX512-VNNI dpbusd instruction can be used for inner product. +// `dim` includes the 24-byte metadata tail. +void cosine_int8_query_preprocess(void *query, size_t dim); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.cc b/src/turbo/sse/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..22447509b --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/inner_product.cc @@ -0,0 +1,13 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/src/turbo/sse/record_quantized_int8/inner_product.h b/src/turbo/sse/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..a0b74ecbf --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/inner_product.h @@ -0,0 +1,15 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..b9b8f23ef --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,134 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "avx512_vnni/record_quantized_int8/common.h" +#if defined(__AVX512VNNI__) +#include +#endif + +// Tail layout for quantized INT8 squared Euclidean vectors: +// +// [ original_dim bytes: int8_t elements ] +// [ float scale_a ] (ma) +// [ float bias_a ] (mb) +// [ float sum_a ] (ms) +// [ float sum2_a ] (ms2) +// [ int int8_sum ] (sum of raw int8 elements, used for bias correction +// when the query has been shifted to uint8 via +128) +// +// Total tail size: 4 floats + 1 int = 20 bytes, so dim = original_dim + 20. 
+ +namespace zvec::turbo::avx512_vnni { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512VNNI__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::ip_int8_avx512_vnni(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512VNNI__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + + internal::ip_int8_batch_avx512_vnni(vectors, query, n, original_dim, + distances); + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + float qs2 = q_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + for (size_t i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + float ms2 = m_tail[3]; + // Correct for the +128 shift applied to the query during preprocessing: + // dpbusd computes sum(uint8_query[i] * int8_data[i]) + // = sum((int8_query[i] + 128) * int8_data[i]) + // = true_ip + 128 * sum(int8_data[i]) + // int8_sum is 
stored as the 5th int-sized field after the 4 floats. + int int8_sum = reinterpret_cast(m_tail)[4]; + float &result = distances[i]; + result -= 128.0f * static_cast(int8_sum); + result = ma * ma * ms2 + sum2 - 2 * ma * qa * result + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif +} + +void squared_euclidean_int8_query_preprocess(void *query, size_t dim) { +#if defined(__AVX512VNNI__) + const int original_dim = static_cast(dim) - 20; + if (original_dim <= 0) { + return; + } + internal::shift_int8_to_uint8_avx512(query, original_dim); +#else + (void)query; + (void)dim; +#endif +} + +} // namespace zvec::turbo::avx512_vnni diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.h b/src/turbo/sse/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..1e2cf45b4 --- /dev/null +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute squared Euclidean distance between a single quantized INT8 +// vector pair. +// `dim` includes the original vector bytes plus a 20-byte metadata tail +// (4 floats: scale_a, bias_a, sum_a, sum2_a). 
+void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared_euclidean_int8_distance. +// The query must have been preprocessed by +// squared_euclidean_int8_query_preprocess (int8 -> uint8 via +128 shift) +// before calling this function. +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +// Preprocess the query vector in-place (shift int8 -> uint8 by adding 128) +// for the batch path. Only the original_dim bytes are shifted; the metadata +// tail is left intact. `dim` includes the 20-byte metadata tail. +void squared_euclidean_int8_query_preprocess(void *query, size_t dim); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index a731cfed1..5f3c3cb07 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -14,6 +14,9 @@ #include #include +#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/inner_product.h" +#include "avx2/record_quantized_int4/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" @@ -33,6 +36,21 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, } } } + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_distance; + } + } + } + } return nullptr; } @@ -51,6 +69,23 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type, } } } + + if (data_type == DataType::kInt4) { + if (quantize_type == QuantizeType::kDefault) { + if 
(zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int4_batch_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_batch_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_batch_distance; + } + } + } + } + return nullptr; } From 51cc10e95c6ca5c7079804d2bf2adabddc4006c5 Mon Sep 17 00:00:00 2001 From: ray Date: Wed, 25 Mar 2026 14:36:17 +0800 Subject: [PATCH 03/44] refactor: fix int4 ip --- .../avx2/record_quantized_int4/cosine.cc | 2 +- .../record_quantized_int4/inner_product.cc | 10 +- .../{common.h => inner_product_common.h} | 61 ++-- .../squared_euclidean.cc | 4 +- .../squared_euclidean_common.h | 260 ++++++++++++++++++ .../metric/quantized_integer_metric_test.cc | 43 +-- 6 files changed, 308 insertions(+), 72 deletions(-) rename src/turbo/avx2/record_quantized_int4/{common.h => inner_product_common.h} (87%) create mode 100644 src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc index d40c8e7db..7a15876d1 100644 --- a/src/turbo/avx2/record_quantized_int4/cosine.cc +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "avx2/record_quantized_int4/cosine.h" -#include "avx2/record_quantized_int4/common.h" +#include "avx2/record_quantized_int4/inner_product_common.h" #if defined(__AVX2__) #include #endif diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc index 9dc36e6d6..fdb25f9a5 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product.cc +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "avx2/record_quantized_int4/inner_product.h" -#include "avx2/record_quantized_int4/common.h" +#include "avx2/record_quantized_int4/inner_product_common.h" #if defined(__AVX2__) #include @@ -43,17 +43,13 @@ void inner_product_int4_distance(const void *a, const void *b, size_t dim, float qa = a_tail[0]; float qb = a_tail[1]; float qs = a_tail[2]; - float qs2 = a_tail[3]; - const float sum = qa * qs; - const float sum2 = qa * qa * qs2; float ma = b_tail[0]; float mb = b_tail[1]; float ms = b_tail[2]; - float ms2 = b_tail[3]; - *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + - (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); #else (void)a; diff --git a/src/turbo/avx2/record_quantized_int4/common.h b/src/turbo/avx2/record_quantized_int4/inner_product_common.h similarity index 87% rename from src/turbo/avx2/record_quantized_int4/common.h rename to src/turbo/avx2/record_quantized_int4/inner_product_common.h index bd223e108..bec7f61b2 100644 --- a/src/turbo/avx2/record_quantized_int4/common.h +++ b/src/turbo/avx2/record_quantized_int4/inner_product_common.h @@ -65,7 +65,7 @@ static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { return _mm_cvtsi128_si32(x4); } -#define MASK_INT4_SSE _mm_set1_epi32(0xf0f0f0f0) +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) #define ONES_INT16_SSE _mm_set1_epi32(0x00010001) #define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) @@ -129,6 +129,22 @@ static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ } +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = 
_mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + //! Compute the distance between matrix and query static __attribute__((always_inline)) void ip_int4_avx2(const void *a, const void *b, @@ -136,47 +152,24 @@ static __attribute__((always_inline)) void ip_int4_avx2(const void *a, float *distance) { const uint8_t *lhs = reinterpret_cast(a); const uint8_t *rhs = reinterpret_cast(b); - const uint8_t *last = lhs + size; - const uint8_t *last_aligned = lhs + ((size >> 5) << 5); - __m256i ymm_sum = _mm256_setzero_si256(); + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); - if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { - for (; lhs != last_aligned; lhs += 32, rhs += 32) { - __m256i ymm_lhs = _mm256_load_si256((const __m256i *)(lhs)); - __m256i ymm_rhs = _mm256_load_si256((const __m256i *)(rhs)); - FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) - } - - if (last >= lhs + 16) { - __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); - __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); - __m128i xmm_sum = _mm_setzero_si128(); + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) - ymm_sum = _mm256_add_epi32(_mm256_set_m128i(_mm_setzero_si128(), xmm_sum), - ymm_sum); - lhs += 16; - rhs += 16; } } else { - for (; lhs != last_aligned; lhs += 32, rhs += 32) { - __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)(lhs)); - __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)(rhs)); - FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) - } - - if (last >= lhs + 16) { - __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); - __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); - 
__m128i xmm_sum = _mm_setzero_si128(); + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) - ymm_sum = _mm256_add_epi32(_mm256_set_m128i(_mm_setzero_si128(), xmm_sum), - ymm_sum); - lhs += 16; - rhs += 16; } } - float result = static_cast(HorizontalAdd_INT32_V256(ymm_sum)); + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); switch (last - lhs) { case 15: diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc index 676e62aae..1454955c9 100644 --- a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx2/record_quantized_int4/common.h" -#include "avx2/record_quantized_int4/cosine.h" +#include "avx2/record_quantized_int4/squared_euclidean.h" +#include "avx2/record_quantized_int4/squared_euclidean_common.h" #if defined(__AVX2__) #include diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h b/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h new file mode 100644 index 000000000..bec7f61b2 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h @@ -0,0 +1,260 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, 
-6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void ip_int4_avx2(const void *a, + const void *b, + size_t size, + float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], rhs[9], 
result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +template +__attribute__((always_inline)) void ip_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void ip_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + ip_int4_batch_avx2_impl(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + ip_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git 
a/tests/core/metric/quantized_integer_metric_test.cc b/tests/core/metric/quantized_integer_metric_test.cc index 501d8c7b9..f56d6ef67 100644 --- a/tests/core/metric/quantized_integer_metric_test.cc +++ b/tests/core/metric/quantized_integer_metric_test.cc @@ -32,8 +32,7 @@ using namespace zvec::ailego; static IndexHolder::Pointer GetHolder( size_t dim, size_t count, std::uniform_real_distribution &dist) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); auto holder = std::make_shared>(dim); for (size_t i = 0; i < count; ++i) { ailego::NumericalVector vec(dim); @@ -71,8 +70,7 @@ TEST(QuantizedIntegerMetric, General) { Params params; - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 1.0); const size_t DIMENSION = 21; ailego::NumericalVector x(DIMENSION); @@ -141,8 +139,7 @@ TEST(QuantizedIntegerMetric, General) { } TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -202,8 +199,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { } TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanReformer) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); std::uniform_int_distribution dist2(0, 1); @@ -251,7 +247,7 @@ void TestDistanceMatrixInt8(const std::string &metric_name) { const size_t batch_size = M; const size_t query_size = N; - size_t dimension = (std::uniform_int_distribution(1, 65))(gen)*4; + size_t dimension = (std::uniform_int_distribution(1, 65))(gen) * 4; auto holder = GetHolder(dimension, batch_size, dist); IndexMeta meta(IndexMeta::DT_FP32, dimension); meta.set_metric(metric_name, 0, Params()); @@ -344,8 +340,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanMetric) { } 
TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -404,8 +399,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { } TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanReformer) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); std::uniform_int_distribution dist2(0, 1); @@ -453,7 +447,7 @@ void TestDistanceMatrixInt4(const std::string &metric_name) { const size_t batch_size = M; const size_t query_size = N; - size_t dimension = (std::uniform_int_distribution(1, 65))(gen)*8; + size_t dimension = (std::uniform_int_distribution(1, 65))(gen) * 8; auto holder = GetHolder(dimension, batch_size, dist); IndexMeta meta(IndexMeta::DT_FP32, dimension); meta.set_metric(metric_name, 0, Params()); @@ -546,8 +540,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -631,8 +624,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProductMetric) { } TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -716,8 +708,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProductMetric) { } TEST(QuantizedIntegerMetric, TestInt8MipsSquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ 
-805,8 +796,7 @@ TEST(QuantizedIntegerMetric, TestInt8MipsSquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt4MipsSquaredEuclidean) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; @@ -890,8 +880,7 @@ TEST(QuantizedIntegerMetric, TestInt4MipsSquaredEuclideanMetric) { } TEST(QuantizedIntegerMetric, TestInt8NormalizedCosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -990,8 +979,7 @@ TEST(QuantizedIntegerMetric, TestInt8NormalizedCosineMetric) { } TEST(QuantizedIntegerMetric, TestInt8Cosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); @@ -1071,8 +1059,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { } TEST(QuantizedIntegerMetric, TestInt4NormalizedCosine) { - std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; From 12395f6ad3574ae34c9cab3ea832f177062ec3b5 Mon Sep 17 00:00:00 2001 From: ray Date: Wed, 25 Mar 2026 15:50:46 +0800 Subject: [PATCH 04/44] refactor: add avx2 int4 l2 --- src/core/metric/quantized_integer_metric.cc | 7 ++++ .../avx2/record_quantized_int4/cosine.cc | 2 +- .../record_quantized_int4/inner_product.cc | 36 +------------------ .../inner_product_common.h | 6 ++-- .../squared_euclidean.cc | 31 +++++++++++++++- .../squared_euclidean_common.h | 6 ++-- src/turbo/turbo.cc | 9 +++++ 7 files changed, 52 insertions(+), 45 deletions(-) diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index 
8562a3c94..a6bb10fc2 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -105,6 +105,13 @@ class QuantizedIntegerMetric : public IndexMetric { return DistanceMatrixCompute(m, n); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { + auto turbo_ret = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } + return DistanceMatrixCompute(m, n); } break; diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc index 7a15876d1..a9e32258c 100644 --- a/src/turbo/avx2/record_quantized_int4/cosine.cc +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -28,7 +28,7 @@ void cosine_int4_distance(const void *a, const void *b, size_t dim, return; } - internal::ip_int4_avx2(a, b, original_dim, distance); + internal::inner_product_int4_avx2(a, b, original_dim, distance); const float *a_tail = reinterpret_cast( reinterpret_cast(a) + original_dim); diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc index fdb25f9a5..5d98e995c 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product.cc +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -33,7 +33,7 @@ void inner_product_int4_distance(const void *a, const void *b, size_t dim, return; } - internal::ip_int4_avx2(a, b, original_dim, distance); + internal::inner_product_int4_avx2(a, b, original_dim, distance); const float *a_tail = reinterpret_cast( reinterpret_cast(a) + original_dim); @@ -50,7 +50,6 @@ void inner_product_int4_distance(const void *a, const void *b, size_t dim, *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); - #else (void)a; (void)b; @@ -64,40 +63,7 @@ void inner_product_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, 
float *distances) { #if defined(__AVX2__) - const int original_dim = dim - 24; - if (original_dim <= 0) { - return; - } - internal::ip_int4_batch_avx2(vectors, query, n, original_dim, distances); - - const float *q_tail = reinterpret_cast( - reinterpret_cast(query) + original_dim); - float qa = q_tail[0]; - float qb = q_tail[1]; - float qs = q_tail[2]; - - for (int i = 0; i < n; ++i) { - const float *m_tail = reinterpret_cast( - reinterpret_cast(vectors[i]) + original_dim); - float ma = m_tail[0]; - float mb = m_tail[1]; - float ms = m_tail[2]; - // Correct for the +128 shift applied to the query during preprocessing: - // dpbusd computes sum(uint8_query[i] * int8_data[i]) - // = sum((int8_query[i] + 128) * int8_data[i]) - // = true_ip + 128 * sum(int8_data[i]) - // int8_sum is stored as the 5th int-sized field after the 4 floats. - int int8_sum = reinterpret_cast(m_tail)[4]; - float &result = distances[i]; - result -= 128.0f * static_cast(int8_sum); - - // Dequantize and compute cosine distance: - // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms - // + original_dim * qb * mb) - result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + - static_cast(original_dim) * qb * mb); - } #else (void)vectors; (void)query; diff --git a/src/turbo/avx2/record_quantized_int4/inner_product_common.h b/src/turbo/avx2/record_quantized_int4/inner_product_common.h index bec7f61b2..006fa05e7 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product_common.h +++ b/src/turbo/avx2/record_quantized_int4/inner_product_common.h @@ -146,10 +146,8 @@ static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { #endif // __SSE2__ //! 
Compute the distance between matrix and query -static __attribute__((always_inline)) void ip_int4_avx2(const void *a, - const void *b, - size_t size, - float *distance) { +static __attribute__((always_inline)) void inner_product_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { const uint8_t *lhs = reinterpret_cast(a); const uint8_t *rhs = reinterpret_cast(b); const uint8_t *last = lhs + size; diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc index 1454955c9..60600ef4d 100644 --- a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "avx2/record_quantized_int4/squared_euclidean.h" -#include "avx2/record_quantized_int4/squared_euclidean_common.h" +#include "avx2/record_quantized_int4/inner_product_common.h" #if defined(__AVX2__) #include @@ -24,6 +24,35 @@ namespace zvec::turbo::avx2 { void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX2__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); #else (void)a; (void)b; diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h 
b/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h index bec7f61b2..82b860b4f 100644 --- a/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h @@ -146,10 +146,8 @@ static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { #endif // __SSE2__ //! Compute the distance between matrix and query -static __attribute__((always_inline)) void ip_int4_avx2(const void *a, - const void *b, - size_t size, - float *distance) { +static __attribute__((always_inline)) void squared_euclidean_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { const uint8_t *lhs = reinterpret_cast(a); const uint8_t *rhs = reinterpret_cast(b); const uint8_t *last = lhs + size; diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 5f3c3cb07..8b59b6b74 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -34,6 +34,15 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, return avx512_vnni::cosine_int8_distance; } } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + // if (metric_type == MetricType::kSquaredEuclidean) { + // return avx2::squared_euclidean_int8_distance; + // } + // if (metric_type == MetricType::kCosine) { + // return avx2::cosine_int8_distance; + // } + } } } if (data_type == DataType::kInt4) { From 1ed3209fb474e5c279161e1ae62b96ec2f26fd05 Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 26 Mar 2026 17:20:46 +0800 Subject: [PATCH 05/44] refactor: add dist funcs --- src/core/metric/quantized_integer_metric.cc | 6 ++ src/include/zvec/turbo/turbo.h | 24 +++-- .../avx2/record_quantized_int4/cosine.cc | 3 +- .../inner_product_common.h | 12 +-- .../squared_euclidean.cc | 33 +++++++ .../avx2/record_quantized_int8/cosine.cc | 48 +++++++++ src/turbo/avx2/record_quantized_int8/cosine.h | 30 ++++++ .../record_quantized_int8/inner_product.cc | 53 ++++++++++ .../record_quantized_int8/inner_product.h | 31 ++++++ 
.../inner_product_common.h | 69 +++++++++++++ .../squared_euclidean.cc | 50 ++++++++++ .../record_quantized_int8/squared_euclidean.h | 31 ++++++ .../squared_euclidean_common.h | 12 +-- src/turbo/sse/record_quantized_int4/common.h | 43 -------- src/turbo/sse/record_quantized_int4/cosine.cc | 15 +-- src/turbo/sse/record_quantized_int4/cosine.h | 8 +- .../record_quantized_int4/inner_product.cc | 75 ++------------ .../sse/record_quantized_int4/inner_product.h | 3 +- .../squared_euclidean.cc | 37 +++++++ .../record_quantized_int4/squared_euclidean.h | 16 +++ src/turbo/sse/record_quantized_int8/cosine.cc | 36 +++++++ src/turbo/sse/record_quantized_int8/cosine.h | 5 - .../record_quantized_int8/inner_product.cc | 40 ++++++++ .../sse/record_quantized_int8/inner_product.h | 16 +++ .../squared_euclidean.cc | 99 ++----------------- src/turbo/turbo.cc | 92 ++++++++++++++--- 26 files changed, 625 insertions(+), 262 deletions(-) create mode 100644 src/turbo/avx2/record_quantized_int8/cosine.cc create mode 100644 src/turbo/avx2/record_quantized_int8/cosine.h create mode 100644 src/turbo/avx2/record_quantized_int8/inner_product.cc create mode 100644 src/turbo/avx2/record_quantized_int8/inner_product.h create mode 100644 src/turbo/avx2/record_quantized_int8/inner_product_common.h create mode 100644 src/turbo/avx2/record_quantized_int8/squared_euclidean.cc create mode 100644 src/turbo/avx2/record_quantized_int8/squared_euclidean.h rename src/turbo/avx2/{record_quantized_int4 => record_quantized_int8}/squared_euclidean_common.h (96%) delete mode 100644 src/turbo/sse/record_quantized_int4/common.h diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index a6bb10fc2..b0fc95995 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -118,6 +118,12 @@ class QuantizedIntegerMetric : public IndexMetric { case MetricType::kInnerProduct: if (meta_.data_type() == 
IndexMeta::DataType::DT_INT8) { + auto turbo_ret = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault); + if (turbo_ret && m == 1 && n == 1) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h index f6054c7a8..098067428 100644 --- a/src/include/zvec/turbo/turbo.h +++ b/src/include/zvec/turbo/turbo.h @@ -43,15 +43,25 @@ enum class QuantizeType { kDefault, }; +enum class CpuArchType { + kAuto, + kSSE, + kAVX2, + kAVX512, + kAVX512VNNI, + kAVX512FP16 +}; + DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type); + QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); -BatchDistanceFunc get_batch_distance_func(MetricType metric_type, - DataType data_type, - QuantizeType quantize_type); +BatchDistanceFunc get_batch_distance_func( + MetricType metric_type, DataType data_type, QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); -QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, - DataType data_type, - QuantizeType quantize_type); +QueryPreprocessFunc get_query_preprocess_func( + MetricType metric_type, DataType data_type, QuantizeType quantize_type, + CpuArchType cpu_arch_type = CpuArchType::kAuto); } // namespace zvec::turbo diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc index a9e32258c..f83c7358c 100644 --- a/src/turbo/avx2/record_quantized_int4/cosine.cc +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -65,7 +65,8 @@ void cosine_int4_batch_distance(const void *const *vectors, const void *query, return; } - internal::ip_int4_batch_avx2(vectors, query, n, original_dim, distances); + internal::inner_product_int4_batch_avx2(vectors, query, n, original_dim, + distances); const float *q_tail = reinterpret_cast( reinterpret_cast(query) 
+ original_dim); diff --git a/src/turbo/avx2/record_quantized_int4/inner_product_common.h b/src/turbo/avx2/record_quantized_int4/inner_product_common.h index 006fa05e7..6d12504e3 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product_common.h +++ b/src/turbo/avx2/record_quantized_int4/inner_product_common.h @@ -223,12 +223,12 @@ static __attribute__((always_inline)) void inner_product_int4_avx2( // single query. Uses AVX512-VNNI dpbusd instruction. // `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. template -__attribute__((always_inline)) void ip_int4_batch_avx2_impl( +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( const void *query, const void *const *vectors, const std::array &prefetch_ptrs, size_t dimensionality, float *distances) {} -static __attribute__((always_inline)) void ip_int4_batch_avx2( +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { static constexpr size_t batch_size = 2; @@ -243,13 +243,13 @@ static __attribute__((always_inline)) void ip_int4_batch_avx2( prefetch_ptrs[j] = nullptr; } } - ip_int4_batch_avx2_impl(query, &vectors[i], prefetch_ptrs, dim, - distances + i); + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); } for (; i < n; i++) { std::array prefetch_ptrs{nullptr}; - ip_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, dim, - distances + i); + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); } } diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc index 60600ef4d..1599a722d 100644 --- a/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/avx2/record_quantized_int4/squared_euclidean.cc @@ -65,7 +65,40 @@ void squared_euclidean_int4_batch_distance(const void *const *vectors, const 
void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX2__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_batch_avx2(vectors, query, n, original_dim, + distances); + + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + float qs2 = q_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + for (int i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + float ms2 = m_tail[3]; + + float &result = distances[i]; + result = ma * ma * ms2 + sum2 - 2 * ma * qa * result + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx2/record_quantized_int8/cosine.cc b/src/turbo/avx2/record_quantized_int8/cosine.cc new file mode 100644 index 000000000..5486a52a6 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/cosine.cc @@ -0,0 +1,48 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int8/cosine.h" +#include "avx2/record_quantized_int8/inner_product_common.h" +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/cosine.h b/src/turbo/avx2/record_quantized_int8/cosine.h new file mode 100644 index 000000000..6074ea428 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int8 vector pair. +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. 
+void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.cc b/src/turbo/avx2/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..19fe96c7d --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product.cc @@ -0,0 +1,53 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx2/record_quantized_int8/inner_product.h" +#include "avx2/record_quantized_int8/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +// Compute squared Euclidean distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX2__ +} + +// Batch version of inner_product_int8_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.h b/src/turbo/avx2/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..249bafd00 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int8/inner_product_common.h b/src/turbo/avx2/record_quantized_int8/inner_product_common.h new file mode 100644 index 000000000..2c099ad13 --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/inner_product_common.h @@ -0,0 +1,69 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. 
+template +__attribute__((always_inline)) void inner_product_int8_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int8_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int8_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int8_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..2d493602b --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,50 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/record_quantized_int8/squared_euclidean.h" +#include "avx2/record_quantized_int8/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..40d8a1baf --- /dev/null +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. 
+void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h similarity index 96% rename from src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h rename to src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h index 82b860b4f..b352108ed 100644 --- a/src/turbo/avx2/record_quantized_int4/squared_euclidean_common.h +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h @@ -223,12 +223,12 @@ static __attribute__((always_inline)) void squared_euclidean_int4_avx2( // single query. Uses AVX512-VNNI dpbusd instruction. // `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. template -__attribute__((always_inline)) void ip_int4_batch_avx2_impl( +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( const void *query, const void *const *vectors, const std::array &prefetch_ptrs, size_t dimensionality, float *distances) {} -static __attribute__((always_inline)) void ip_int4_batch_avx2( +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { static constexpr size_t batch_size = 2; @@ -243,13 +243,13 @@ static __attribute__((always_inline)) void ip_int4_batch_avx2( prefetch_ptrs[j] = nullptr; } } - ip_int4_batch_avx2_impl(query, &vectors[i], prefetch_ptrs, dim, - distances + i); + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); } for (; i < n; i++) { std::array prefetch_ptrs{nullptr}; - ip_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, dim, - distances + i); + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); } } diff --git 
a/src/turbo/sse/record_quantized_int4/common.h b/src/turbo/sse/record_quantized_int4/common.h deleted file mode 100644 index c47294eb6..000000000 --- a/src/turbo/sse/record_quantized_int4/common.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - -#pragma once - -#if defined(__SSE4_1__) -#include -#include -#include - -namespace zvec::turbo::sse::internal { - -static __attribute__((always_inline)) void ip_int4_sse(const void *a, - const void *b, - size_t size, - float *distance) {} - -static __attribute__((always_inline)) void ip_int4_batch_sse( - const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) {} - -} // namespace zvec::turbo::sse::internal - -#endif // defined(__SSE4_1__) diff --git a/src/turbo/sse/record_quantized_int4/cosine.cc b/src/turbo/sse/record_quantized_int4/cosine.cc index f041bfe80..1b955d983 100644 --- a/src/turbo/sse/record_quantized_int4/cosine.cc +++ b/src/turbo/sse/record_quantized_int4/cosine.cc @@ -13,8 +13,8 @@ // limitations under the License. 
#include "sse/record_quantized_int4/cosine.h" -#include "sse/record_quantized_int4/common.h" -#if defined(__SSE4_1__) +#include "sse/record_quantized_int4/inner_product_common.h" +#if defined(__SSE__) #include #endif @@ -22,12 +22,7 @@ namespace zvec::turbo::sse { void cosine_int4_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__SSE4_1__) - // `dim` is the full encoded size; the original vector occupies dim-24 bytes. - const int original_dim = dim - 24; - if (original_dim <= 0) { - return; - } +#if defined(__SSE__) #else (void)a; @@ -39,7 +34,7 @@ void cosine_int4_distance(const void *a, const void *b, size_t dim, void cosine_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__SSE4_1__) +#if defined(__SSE__) #else (void)vectors; @@ -47,7 +42,7 @@ void cosine_int4_batch_distance(const void *const *vectors, const void *query, (void)n; (void)dim; (void)distances; -#endif //__SSE4_1__ +#endif //__SSE__ } } // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/cosine.h b/src/turbo/sse/record_quantized_int4/cosine.h index bab173eca..87306a06e 100644 --- a/src/turbo/sse/record_quantized_int4/cosine.h +++ b/src/turbo/sse/record_quantized_int4/cosine.h @@ -19,15 +19,11 @@ namespace zvec::turbo::sse { // Compute cosine distance (negative inner product after normalization) between -// a single quantized INT8 vector pair. -// `dim` includes the original vector bytes plus a 24-byte metadata tail -// (3 floats: scale_a, bias_a, sum_a). +// a single quantized INT4 vector pair. void cosine_int4_distance(const void *a, const void *b, size_t dim, float *distance); -// Batch version of cosine_int8_distance. -// The query must have been preprocessed by cosine_int8_query_preprocess -// (int8 -> uint8 via + 128 shift) before calling this function. +// Batch version of cosine_int4_distance. 
void cosine_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); diff --git a/src/turbo/sse/record_quantized_int4/inner_product.cc b/src/turbo/sse/record_quantized_int4/inner_product.cc index e8ef5df7c..33a889f5f 100644 --- a/src/turbo/sse/record_quantized_int4/inner_product.cc +++ b/src/turbo/sse/record_quantized_int4/inner_product.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "sse/record_quantized_int4/inner_product.h" -#include "sse/record_quantized_int4/common.h" +#include "sse/record_quantized_int4/inner_product_common.h" -#if defined(__SSE4_1__) +#if defined(__SSE__) #include #endif @@ -25,92 +25,29 @@ namespace zvec::turbo::sse { // vector pair. void inner_product_int4_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__SSE4_1__) - // `dim` is the full encoded size; the original vector occupies dim-24 bytes. - const int d = dim - 32; - const size_t original_dim = d >> 1; - - if (original_dim <= 0) { - return; - } - - internal::ip_int4_sse(a, b, original_dim, distance); - - const float *a_tail = reinterpret_cast( - reinterpret_cast(a) + original_dim); - const float *b_tail = reinterpret_cast( - reinterpret_cast(b) + original_dim); - - float qa = a_tail[0]; - float qb = a_tail[1]; - float qs = a_tail[2]; - float qs2 = a_tail[3]; - const float sum = qa * qs; - const float sum2 = qa * qa * qs2; - - float ma = b_tail[0]; - float mb = b_tail[1]; - float ms = b_tail[2]; - float ms2 = b_tail[3]; - - *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + - (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); +#if defined(__SSE__) #else (void)a; (void)b; (void)dim; (void)distance; -#endif +#endif //__SSE__ } // Batch version of inner_product_int4_distance. 
void inner_product_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__SSE4_1__) - // `dim` is the full encoded size; the original vector occupies dim-24 bytes. - const int original_dim = dim - 24; - if (original_dim <= 0) { - return; - } - - internal::ip_int4_batch_sse(vectors, query, n, original_dim, distances); - - const float *q_tail = reinterpret_cast( - reinterpret_cast(query) + original_dim); - float qa = q_tail[0]; - float qb = q_tail[1]; - float qs = q_tail[2]; - - for (int i = 0; i < n; ++i) { - const float *m_tail = reinterpret_cast( - reinterpret_cast(vectors[i]) + original_dim); - float ma = m_tail[0]; - float mb = m_tail[1]; - float ms = m_tail[2]; - // Correct for the +128 shift applied to the query during preprocessing: - // dpbusd computes sum(uint8_query[i] * int8_data[i]) - // = sum((int8_query[i] + 128) * int8_data[i]) - // = true_ip + 128 * sum(int8_data[i]) - // int8_sum is stored as the 5th int-sized field after the 4 floats. 
- int int8_sum = reinterpret_cast(m_tail)[4]; - float &result = distances[i]; - result -= 128.0f * static_cast(int8_sum); +#if defined(__SSE__) - // Dequantize and compute cosine distance: - // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms - // + original_dim * qb * mb) - result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + - static_cast(original_dim) * qb * mb); - } #else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; -#endif // __SSE4_1__ +#endif //__SSE__ } } // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/inner_product.h b/src/turbo/sse/record_quantized_int4/inner_product.h index 8a6ee015c..4ee508ed2 100644 --- a/src/turbo/sse/record_quantized_int4/inner_product.h +++ b/src/turbo/sse/record_quantized_int4/inner_product.h @@ -14,12 +14,11 @@ #pragma once - #include namespace zvec::turbo::sse { -// Compute squared Euclidean distance between a single quantized INT4 +// Compute inner product distance between a single quantized INT4 // vector pair. void inner_product_int4_distance(const void *a, const void *b, size_t dim, float *distance); diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc index 22447509b..0b4d34cd9 100644 --- a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc @@ -11,3 +11,40 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ +#include "sse/record_quantized_int4/squared_euclidean.h" +#include "sse/record_quantized_int4/inner_product_common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE__ +} + +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.h b/src/turbo/sse/record_quantized_int4/squared_euclidean.h index a0b74ecbf..3cff9f99b 100644 --- a/src/turbo/sse/record_quantized_int4/squared_euclidean.h +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.h @@ -13,3 +13,19 @@ // limitations under the License. #pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute squared euclidean distance between a single quantized INT4 +// vector pair. +void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT4. +void squared_euclidean_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::sse diff --git a/src/turbo/sse/record_quantized_int8/cosine.cc b/src/turbo/sse/record_quantized_int8/cosine.cc index 22447509b..dabff9f71 100644 --- a/src/turbo/sse/record_quantized_int8/cosine.cc +++ b/src/turbo/sse/record_quantized_int8/cosine.cc @@ -11,3 +11,39 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ +#include "sse/record_quantized_int8/cosine.h" +#include "sse/record_quantized_int8/common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __SSE__ +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/cosine.h b/src/turbo/sse/record_quantized_int8/cosine.h index 5fb491eab..e0ac7f556 100644 --- a/src/turbo/sse/record_quantized_int8/cosine.h +++ b/src/turbo/sse/record_quantized_int8/cosine.h @@ -31,9 +31,4 @@ void cosine_int8_distance(const void *a, const void *b, size_t dim, void cosine_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -// Preprocess the query vector in-place (shift int8 -> uint8 by adding 128) -// so that the AVX512-VNNI dpbusd instruction can be used for inner product. -// `dim` includes the 24-byte metadata tail. -void cosine_int8_query_preprocess(void *query, size_t dim); - } // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.cc b/src/turbo/sse/record_quantized_int8/inner_product.cc index 22447509b..7c1bea677 100644 --- a/src/turbo/sse/record_quantized_int8/inner_product.cc +++ b/src/turbo/sse/record_quantized_int8/inner_product.cc @@ -11,3 +11,43 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ +#include "sse/record_quantized_int8/inner_product.h" +#include "sse/record_quantized_int8/common.h" + +#if defined(__SSE__) +#include +#endif + +namespace zvec::turbo::sse { + +// Compute inner product distance between a single quantized INT8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__SSE__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__SSE__ +} + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__SSE__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__SSE__ +} + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/inner_product.h b/src/turbo/sse/record_quantized_int8/inner_product.h index a0b74ecbf..9c6314b35 100644 --- a/src/turbo/sse/record_quantized_int8/inner_product.h +++ b/src/turbo/sse/record_quantized_int8/inner_product.h @@ -13,3 +13,19 @@ // limitations under the License. #pragma once + +#include + +namespace zvec::turbo::sse { + +// Compute inner product distance between a single quantized INT8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance.
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc index b9b8f23ef..d51ee0cf6 100644 --- a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc @@ -12,56 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx512_vnni/record_quantized_int8/squared_euclidean.h" -#include "avx512_vnni/record_quantized_int8/common.h" -#if defined(__AVX512VNNI__) +#include "sse/record_quantized_int8/squared_euclidean.h" +#include "sse/record_quantized_int8/common.h" +#if defined(__SSE__) #include #endif -// Tail layout for quantized INT8 squared Euclidean vectors: -// -// [ original_dim bytes: int8_t elements ] -// [ float scale_a ] (ma) -// [ float bias_a ] (mb) -// [ float sum_a ] (ms) -// [ float sum2_a ] (ms2) -// [ int int8_sum ] (sum of raw int8 elements, used for bias correction -// when the query has been shifted to uint8 via +128) -// -// Total tail size: 4 floats + 1 int = 20 bytes, so dim = original_dim + 20. 
- -namespace zvec::turbo::avx512_vnni { +namespace zvec::turbo::sse { void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX512VNNI__) - const int original_dim = dim - 20; - if (original_dim <= 0) { - return; - } - internal::ip_int8_avx512_vnni(a, b, original_dim, distance); - - const float *a_tail = reinterpret_cast( - reinterpret_cast(a) + original_dim); - const float *b_tail = reinterpret_cast( - reinterpret_cast(b) + original_dim); - - float ma = a_tail[0]; - float mb = a_tail[1]; - float ms = a_tail[2]; - float ms2 = a_tail[3]; - - float qa = b_tail[0]; - float qb = b_tail[1]; - float qs = b_tail[2]; - float qs2 = b_tail[3]; - - const float sum = qa * qs; - const float sum2 = qa * qa * qs2; +#if defined(__SSE__) - *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + - (mb - qb) * (mb - qb) * original_dim + - 2 * (mb - qb) * (ms * ma - sum); #else (void)a; (void)b; @@ -73,42 +35,8 @@ void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, void squared_euclidean_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX512VNNI__) - const int original_dim = dim - 20; - if (original_dim <= 0) { - return; - } +#if defined(__SSE__) - internal::ip_int8_batch_avx512_vnni(vectors, query, n, original_dim, - distances); - const float *q_tail = reinterpret_cast( - reinterpret_cast(query) + original_dim); - float qa = q_tail[0]; - float qb = q_tail[1]; - float qs = q_tail[2]; - float qs2 = q_tail[3]; - - const float sum = qa * qs; - const float sum2 = qa * qa * qs2; - for (size_t i = 0; i < n; ++i) { - const float *m_tail = reinterpret_cast( - reinterpret_cast(vectors[i]) + original_dim); - float ma = m_tail[0]; - float mb = m_tail[1]; - float ms = m_tail[2]; - float ms2 = m_tail[3]; - // Correct for the +128 shift applied to the query during preprocessing: - // dpbusd computes sum(uint8_query[i] * int8_data[i]) - 
// = sum((int8_query[i] + 128) * int8_data[i]) - // = true_ip + 128 * sum(int8_data[i]) - // int8_sum is stored as the 5th int-sized field after the 4 floats. - int int8_sum = reinterpret_cast(m_tail)[4]; - float &result = distances[i]; - result -= 128.0f * static_cast(int8_sum); - result = ma * ma * ms2 + sum2 - 2 * ma * qa * result + - (mb - qb) * (mb - qb) * original_dim + - 2 * (mb - qb) * (ms * ma - sum); - } #else (void)vectors; (void)query; @@ -118,17 +46,4 @@ void squared_euclidean_int8_batch_distance(const void *const *vectors, #endif } -void squared_euclidean_int8_query_preprocess(void *query, size_t dim) { -#if defined(__AVX512VNNI__) - const int original_dim = static_cast(dim) - 20; - if (original_dim <= 0) { - return; - } - internal::shift_int8_to_uint8_avx512(query, original_dim); -#else - (void)query; - (void)dim; -#endif -} - -} // namespace zvec::turbo::avx512_vnni +} // namespace zvec::turbo::sse diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 8b59b6b74..d135d2fe0 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -17,16 +17,29 @@ #include "avx2/record_quantized_int4/cosine.h" #include "avx2/record_quantized_int4/inner_product.h" #include "avx2/record_quantized_int4/squared_euclidean.h" +#include "avx2/record_quantized_int8/cosine.h" +#include "avx2/record_quantized_int8/inner_product.h" +#include "avx2/record_quantized_int8/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "sse/record_quantized_int4/cosine.h" +#include "sse/record_quantized_int4/inner_product.h" +#include "sse/record_quantized_int4/squared_euclidean.h" +#include "sse/record_quantized_int8/cosine.h" +#include "sse/record_quantized_int8/inner_product.h" +#include "sse/record_quantized_int8/squared_euclidean.h" namespace zvec::turbo { DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType 
quantize_type, + CpuArchType cpu_arch_type) { + // INT8 if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_distance; } @@ -35,19 +48,44 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, } } - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - // if (metric_type == MetricType::kSquaredEuclidean) { - // return avx2::squared_euclidean_int8_distance; - // } - // if (metric_type == MetricType::kCosine) { - // return avx2::cosine_int8_distance; - // } + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx2::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int8_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return sse::inner_product_int8_distance; + } } } } + + // INT4 if (data_type == DataType::kInt4) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == 
CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx2::squared_euclidean_int4_distance; } @@ -59,16 +97,35 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, } } } + + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return sse::inner_product_int4_distance; + } + } + } } return nullptr; } BatchDistanceFunc get_batch_distance_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_batch_distance; } @@ -81,7 +138,9 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type, if (data_type == DataType::kInt4) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx2::squared_euclidean_int4_batch_distance; } @@ -100,10 +159,13 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type, QueryPreprocessFunc 
get_query_preprocess_func(MetricType metric_type, DataType data_type, - QuantizeType quantize_type) { + QuantizeType quantize_type, + CpuArchType cpu_arch_type) { if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512VNNI)) { if (metric_type == MetricType::kSquaredEuclidean) { return avx512_vnni::squared_euclidean_int8_query_preprocess; } From c6f37d240a340c1295f18f018fcb81e0ea72c49f Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 26 Mar 2026 20:54:53 +0800 Subject: [PATCH 06/44] refactor: add ut for march --- .../inner_product_common.h | 258 ++++++++++++++++++ tests/turbo/quantized_integer_test.cc | 235 ++++++++++++++++ 2 files changed, 493 insertions(+) create mode 100644 src/turbo/sse/record_quantized_int4/inner_product_common.h create mode 100644 tests/turbo/quantized_integer_test.cc diff --git a/src/turbo/sse/record_quantized_int4/inner_product_common.h b/src/turbo/sse/record_quantized_int4/inner_product_common.h new file mode 100644 index 000000000..6d12504e3 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/inner_product_common.h @@ -0,0 +1,258 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Shared SSE/AVX2 inner product kernels for record_quantized_int4 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*! Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//!
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void inner_product_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int4 vectors against a +// single query, reusing the SSE/AVX2 kernels above. +// NOTE(review): the impl below is an empty stub — confirm before relying on it. +template +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff
--git a/tests/turbo/quantized_integer_test.cc b/tests/turbo/quantized_integer_test.cc new file mode 100644 index 000000000..9a7ecac23 --- /dev/null +++ b/tests/turbo/quantized_integer_test.cc @@ -0,0 +1,235 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + 
query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float = ailego::Distance::MinusInnerProduct( + query_vec.data(), doc_vec.data(), DIMENSION); + + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_avx2, score_sse, 0.001); + } +} + +#if 0 +TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1000; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto holder = GetHolder(DIMENSION, COUNT, dist); + ASSERT_EQ(0u, IndexConverter::TrainAndTransform(converter, holder)); + auto holder2 = converter->result(); + EXPECT_EQ(COUNT, holder2->count()); + EXPECT_EQ(IndexMeta::DT_INT4, holder2->data_type()); + auto &meta2 = converter->meta(); + + auto reformer = 
IndexFactory::CreateReformer(meta2.reformer_name()); + ASSERT_TRUE(reformer); + ASSERT_EQ(0u, reformer->init(meta2.reformer_params())); + + ailego::NumericalVector vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + vec[j] = dist(gen); + } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta2; + std::string out; + ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &out, &qmeta2)); + ASSERT_EQ(qmeta2.dimension(), meta2.dimension()); + + auto iter = holder->create_iterator(); + auto iter2 = holder2->create_iterator(); + auto metric = IndexFactory::CreateMetric(meta2.metric_name()); + ASSERT_TRUE(!!metric); + ASSERT_EQ(0, metric->init(meta2, meta2.metric_params())); + auto compute = metric->distance(); + ASSERT_TRUE(compute); + + for (; iter->is_valid(); iter->next(), iter2->next()) { + const float *mf = (const float *)iter->data(); + const int8_t *mi = (const int8_t *)iter2->data(); + const int8_t *qi = reinterpret_cast(&out[0]); + float v1 = ailego::Distance::MinusInnerProduct(mf, vec.data(), + holder->dimension()); + float v2; + compute(mi, qi, holder2->dimension(), &v2); + ASSERT_NEAR(v1, v2, 0.2 * DIMENSION); + + std::string out2; + ASSERT_EQ(0, reformer->convert(iter->data(), qmeta, &out2, &qmeta2)); + ASSERT_EQ(out2.size(), holder2->element_size()); + ASSERT_EQ(0, std::memcmp(out2.data(), iter2->data(), out2.size())); + } +} + +TEST(QuantizedIntegerMetric, TestInt8Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + ASSERT_TRUE(!!converter); + Params converter_params; + ASSERT_EQ(0u, converter->init(meta, converter_params)); + + auto holder = GetHolder(DIMENSION, COUNT, dist); + ASSERT_EQ(0u, 
IndexConverter::TrainAndTransform(converter, holder)); + auto holder2 = converter->result(); + EXPECT_EQ(COUNT, holder2->count()); + EXPECT_EQ(IndexMeta::DT_INT8, holder2->data_type()); + auto &meta2 = converter->meta(); + + auto reformer = IndexFactory::CreateReformer(meta2.reformer_name()); + ASSERT_TRUE(reformer); + ASSERT_EQ(0u, reformer->init(meta2.reformer_params())); + + ailego::NumericalVector vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + vec[j] = dist(gen); + } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta2; + std::string out; + ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &out, &qmeta2)); + ASSERT_EQ(qmeta2.dimension(), meta2.dimension()); + + auto iter = holder->create_iterator(); + auto iter2 = holder2->create_iterator(); + auto metric = IndexFactory::CreateMetric(meta2.metric_name()); + ASSERT_TRUE(!!metric); + ASSERT_EQ(0, metric->init(meta2, meta2.metric_params())); + auto compute_batch = metric->batch_distance(); + ASSERT_TRUE(compute_batch); + + int8_t *qi = reinterpret_cast(&out[0]); + if (auto query_preprocess_func = metric->get_query_preprocess_func(); + query_preprocess_func != nullptr) { + query_preprocess_func(qi, holder2->dimension()); + } + + for (; iter->is_valid(); iter->next(), iter2->next()) { + const float *mf = (const float *)iter->data(); + const int8_t *mi = (const int8_t *)iter2->data(); + + // normalize mf & vec + std::vector normalized_mf(DIMENSION); + memcpy(normalized_mf.data(), mf, DIMENSION * sizeof(float)); + float norm_mf = 0.0; + ailego::Normalizer::L2((float *)normalized_mf.data(), DIMENSION, + &norm_mf); + std::vector normalized_vec(DIMENSION); + memcpy(normalized_vec.data(), vec.data(), DIMENSION * sizeof(float)); + float norm_vec = 0.0; + ailego::Normalizer::L2((float *)normalized_vec.data(), DIMENSION, + &norm_vec); + + float v1 = ailego::Distance::MinusInnerProduct( + normalized_mf.data(), normalized_vec.data(), holder->dimension()); + float v2; 
+ compute_batch(reinterpret_cast(&mi), qi, 1, + holder2->dimension(), &v2); + // printf("%f %f\n", v1, v2); + ASSERT_NEAR(v1, v2, 0.2 * DIMENSION); + + std::string out2; + ASSERT_EQ(0, reformer->convert(iter->data(), qmeta, &out2, &qmeta2)); + ASSERT_EQ(out2.size(), holder2->element_size()); + ASSERT_EQ(0, std::memcmp(out2.data(), iter2->data(), out2.size())); + } +} + +#endif \ No newline at end of file From 573d585a149ebc15c58eda37ba121d0e40928f20 Mon Sep 17 00:00:00 2001 From: ray Date: Fri, 27 Mar 2026 15:11:10 +0800 Subject: [PATCH 07/44] feat: add turbo ut --- tests/CMakeLists.txt | 1 + tests/turbo/CMakeLists.txt | 14 ++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 tests/turbo/CMakeLists.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 03250f1c8..54f917495 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,3 +4,4 @@ include(${PROJECT_ROOT_DIR}/cmake/option.cmake) cc_directories(ailego) cc_directories(db) cc_directories(core) +cc_directories(turbo) diff --git a/tests/turbo/CMakeLists.txt b/tests/turbo/CMakeLists.txt new file mode 100644 index 000000000..0e864858a --- /dev/null +++ b/tests/turbo/CMakeLists.txt @@ -0,0 +1,14 @@ +include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) + +file(GLOB_RECURSE ALL_TEST_SRCS *_test.cc) + +foreach(CC_SRCS ${ALL_TEST_SRCS}) + get_filename_component(CC_TARGET ${CC_SRCS} NAME_WE) + cc_gtest( + NAME ${CC_TARGET} + STRICT + LIBS zvec_ailego core_framework core_metric core_quantizer + SRCS ${CC_SRCS} + INCS . 
${PROJECT_ROOT_DIR}/src/core/ + ) +endforeach() \ No newline at end of file From fdc0f35636731948a3168e9a1eb23489b88acc1e Mon Sep 17 00:00:00 2001 From: ray Date: Fri, 27 Mar 2026 18:13:43 +0800 Subject: [PATCH 08/44] feat: add int8/int4 avx2 sse --- .../record_quantized_int8/inner_product.cc | 22 ++ .../inner_product_common.h | 183 ++++++++++++++++- src/turbo/sse/record_quantized_int8/common.h | 189 +++++++++++++++++- .../record_quantized_int8/inner_product.cc | 22 ++ 4 files changed, 410 insertions(+), 6 deletions(-) diff --git a/src/turbo/avx2/record_quantized_int8/inner_product.cc b/src/turbo/avx2/record_quantized_int8/inner_product.cc index 19fe96c7d..34ba9edd4 100644 --- a/src/turbo/avx2/record_quantized_int8/inner_product.cc +++ b/src/turbo/avx2/record_quantized_int8/inner_product.cc @@ -26,7 +26,29 @@ namespace zvec::turbo::avx2 { void inner_product_int8_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX2__) + if (dim <= 20) { + return; + } + const size_t original_dim = dim - 20; + + internal::inner_product_int8_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); #else (void)a; (void)b; diff --git a/src/turbo/avx2/record_quantized_int8/inner_product_common.h b/src/turbo/avx2/record_quantized_int8/inner_product_common.h index 2c099ad13..e49b36dd3 100644 --- a/src/turbo/avx2/record_quantized_int8/inner_product_common.h +++ b/src/turbo/avx2/record_quantized_int8/inner_product_common.h @@ -30,14 +30,189 @@ namespace zvec::turbo::avx2::internal { -// Compute raw integer inner products for a batch of int8 vectors against a -// single
query. Uses AVX512-VNNI dpbusd instruction. -// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +static __attribute__((always_inline)) void inner_product_int8_avx2( + const void *a, const void *b, size_t size, float *distance) { + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 6) << 6); + float result = 0.0; + + __m256i ymm_sum_0 = _mm256_setzero_si256(); + __m256i ymm_sum_1 = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_load_si256((const 
__m256i *)lhs); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = 
_mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } + result = static_cast( + HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + template __attribute__((always_inline)) void inner_product_int8_batch_avx2_impl( const void *query, const void *const *vectors, const std::array &prefetch_ptrs, - size_t dimensionality, float *distances) {} + size_t dimensionality, float *distances) { + // TBD +} static __attribute__((always_inline)) void inner_product_int8_batch_avx2( const void *const *vectors, const void *query, size_t n, size_t dim, diff --git 
a/src/turbo/sse/record_quantized_int8/common.h b/src/turbo/sse/record_quantized_int8/common.h index cb9727491..1f44d04ab 100644 --- a/src/turbo/sse/record_quantized_int8/common.h +++ b/src/turbo/sse/record_quantized_int8/common.h @@ -24,10 +24,195 @@ #if defined(__SSE__) #include +#include +#include +#include -namespace zvec::turbo::avx512_vnni::sse { +namespace zvec::turbo::sse::internal { +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) -} // namespace zvec::turbo::avx512_vnni::sse +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static __attribute__((always_inline)) void inner_product_int8_sse( + const void *a, const void *b, size_t size, float *distance) { + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 5) << 5); + + __m128i xmm_sum_0 = _mm_setzero_si128(); + __m128i xmm_sum_1 = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m128i xmm_lhs_0 = _mm_load_si128((const __m128i *)(lhs + 0)); + __m128i xmm_lhs_1 = _mm_load_si128((const __m128i *)(lhs + 16)); + __m128i xmm_rhs_0 = _mm_load_si128((const __m128i *)(rhs + 0)); + __m128i xmm_rhs_1 = _mm_load_si128((const __m128i *)(rhs + 16)); + + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); + xmm_rhs_1 = 
_mm_abs_epi8(xmm_rhs_1); + xmm_sum_0 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), + ONES_INT16_SSE), + xmm_sum_0); + xmm_sum_1 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), + ONES_INT16_SSE), + xmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + xmm_sum_0 = _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), ONES_INT16_SSE), + xmm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m128i xmm_lhs_0 = _mm_loadu_si128((const __m128i *)(lhs + 0)); + __m128i xmm_lhs_1 = _mm_loadu_si128((const __m128i *)(lhs + 16)); + __m128i xmm_rhs_0 = _mm_loadu_si128((const __m128i *)(rhs + 0)); + __m128i xmm_rhs_1 = _mm_loadu_si128((const __m128i *)(rhs + 16)); + + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); + xmm_sum_0 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), + ONES_INT16_SSE), + xmm_sum_0); + xmm_sum_1 = + _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), + ONES_INT16_SSE), + xmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + xmm_sum_0 = _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), ONES_INT16_SSE), + xmm_sum_0); + lhs += 16; + rhs += 16; + } + } + float result = static_cast( + HorizontalAdd_INT32_V128(_mm_add_epi32(xmm_sum_0, xmm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + 
FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +template +__attribute__((always_inline)) void inner_product_int8_batch_sse_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + // TBD +} + +static __attribute__((always_inline)) void inner_product_int8_batch_sse( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int8_batch_sse_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int8_batch_sse_impl<1>(query, &vectors[i], 
prefetch_ptrs, dim, + distances + i); + } +} + +} // namespace zvec::turbo::sse::internal #endif // defined(__SSE__) diff --git a/src/turbo/sse/record_quantized_int8/inner_product.cc b/src/turbo/sse/record_quantized_int8/inner_product.cc index 7c1bea677..6b6c4d9c1 100644 --- a/src/turbo/sse/record_quantized_int8/inner_product.cc +++ b/src/turbo/sse/record_quantized_int8/inner_product.cc @@ -26,7 +26,29 @@ namespace zvec::turbo::sse { void inner_product_int8_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__SSE__) + if (dim <= 20) { + return; + } + const size_t original_dim = dim - 20; + + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); #else (void)a; (void)b; From 7be94e071955ef2b7337564d065cb1975cb3b441 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 30 Mar 2026 21:02:02 +0800 Subject: [PATCH 09/44] feat: add dist --- src/turbo/avx2/float32/cosine.cc | 49 ++++ src/turbo/avx2/float32/cosine.h | 30 ++ src/turbo/avx2/float32/inner_product.cc | 53 ++++ src/turbo/avx2/float32/inner_product.h | 31 +++ src/turbo/avx2/float32/inner_product_common.h | 258 ++++++++++++++++++ src/turbo/avx2/float32/squared_euclidean.cc | 48 ++++ src/turbo/avx2/float32/squared_euclidean.h | 31 +++ src/turbo/scalar/float32/cosine.cc | 25 ++ src/turbo/scalar/float32/cosine.h | 30 ++ src/turbo/scalar/float32/inner_product.cc | 29 ++ src/turbo/scalar/float32/inner_product.h | 31 +++ src/turbo/scalar/float32/squared_euclidean.cc | 26 ++ src/turbo/scalar/float32/squared_euclidean.h | 31 +++ 13 files changed, 672 insertions(+) create mode
100644 src/turbo/avx2/float32/cosine.cc create mode 100644 src/turbo/avx2/float32/cosine.h create mode 100644 src/turbo/avx2/float32/inner_product.cc create mode 100644 src/turbo/avx2/float32/inner_product.h create mode 100644 src/turbo/avx2/float32/inner_product_common.h create mode 100644 src/turbo/avx2/float32/squared_euclidean.cc create mode 100644 src/turbo/avx2/float32/squared_euclidean.h create mode 100644 src/turbo/scalar/float32/cosine.cc create mode 100644 src/turbo/scalar/float32/cosine.h create mode 100644 src/turbo/scalar/float32/inner_product.cc create mode 100644 src/turbo/scalar/float32/inner_product.h create mode 100644 src/turbo/scalar/float32/squared_euclidean.cc create mode 100644 src/turbo/scalar/float32/squared_euclidean.h diff --git a/src/turbo/avx2/float32/cosine.cc b/src/turbo/avx2/float32/cosine.cc new file mode 100644 index 000000000..0b77c170b --- /dev/null +++ b/src/turbo/avx2/float32/cosine.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx2/float32/cosine.h" +#include "avx2/float32/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/float32/cosine.h b/src/turbo/avx2/float32/cosine.h new file mode 100644 index 000000000..370724ddd --- /dev/null +++ b/src/turbo/avx2/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/float32/inner_product.cc b/src/turbo/avx2/float32/inner_product.cc new file mode 100644 index 000000000..bf8d5290a --- /dev/null +++ b/src/turbo/avx2/float32/inner_product.cc @@ -0,0 +1,53 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx2/float32/inner_product.h" +#include "avx2/float32/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif //__AVX2__ +} + +// Batch version of inner_product_fp32_distance.
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { +#if defined(__AVX2__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/float32/inner_product.h b/src/turbo/avx2/float32/inner_product.h new file mode 100644 index 000000000..a98659a26 --- /dev/null +++ b/src/turbo/avx2/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/avx2/float32/inner_product_common.h b/src/turbo/avx2/float32/inner_product_common.h new file mode 100644 index 000000000..6d12504e3 --- /dev/null +++ b/src/turbo/avx2/float32/inner_product_common.h @@ -0,0 +1,258 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX2 inner product kernels for float32 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX2__) +#include +#include +#include +#include + +namespace zvec::turbo::avx2::internal { + + +/*!
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! 
Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) + +#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) +#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) + +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) + +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ + { \ + __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ + __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ + __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ + __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ + INT4_LOOKUP_AVX, \ + _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ + ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ + ONES_INT16_AVX); \ + ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ + ONES_INT16_AVX); \ + ymm_sum = \ 
+ _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ + } + +#if defined(__SSE2__) +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} +#endif // __SSE2__ + +//! Compute the distance between matrix and query +static __attribute__((always_inline)) void inner_product_int4_avx2( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +// Compute raw integer inner products for a batch of int8 vectors against a +// single query. Uses AVX512-VNNI dpbusd instruction. +// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. +template +__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) {} + +static __attribute__((always_inline)) void inner_product_int4_batch_avx2( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_int4_batch_avx2_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + +} // namespace zvec::turbo::avx2::internal + +#endif // defined(__AVX2__) diff 
--git a/src/turbo/avx2/float32/squared_euclidean.cc b/src/turbo/avx2/float32/squared_euclidean.cc new file mode 100644 index 000000000..7900c827f --- /dev/null +++ b/src/turbo/avx2/float32/squared_euclidean.cc @@ -0,0 +1,48 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx2/float32/squared_euclidean.h" +#include "avx2/float32/inner_product_common.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec::turbo::avx2 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX2__) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX2__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX2__) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX2__ +} + +} // namespace zvec::turbo::avx2 \ No newline at end of file diff --git a/src/turbo/avx2/float32/squared_euclidean.h b/src/turbo/avx2/float32/squared_euclidean.h new file mode 100644 index 000000000..f2a1402cc --- /dev/null +++ b/src/turbo/avx2/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx2 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx2 diff --git a/src/turbo/scalar/float32/cosine.cc b/src/turbo/scalar/float32/cosine.cc new file mode 100644 index 000000000..f4d1db6e8 --- /dev/null +++ b/src/turbo/scalar/float32/cosine.cc @@ -0,0 +1,25 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float32/cosine.h" + +namespace zvec::turbo::scalar { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) {} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) {} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/cosine.h b/src/turbo/scalar/float32/cosine.h new file mode 100644 index 000000000..b5e4f4eee --- /dev/null +++ b/src/turbo/scalar/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query,
+                                size_t n, size_t dim, float *distances);
+
+}  // namespace zvec::turbo::scalar
\ No newline at end of file
diff --git a/src/turbo/scalar/float32/inner_product.cc b/src/turbo/scalar/float32/inner_product.cc
new file mode 100644
index 000000000..5dd945b7a
--- /dev/null
+++ b/src/turbo/scalar/float32/inner_product.cc
@@ -0,0 +1,29 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "scalar/float32/inner_product.h"
+
+namespace zvec::turbo::scalar {
+
+// Compute inner product distance between a single quantized FP32
+// vector pair.
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim,
+                                 float *distance) {}
+
+// Batch version of inner_product_fp32_distance.
+void inner_product_fp32_batch_distance(const void *const *vectors,
+                                       const void *query, size_t n, size_t dim,
+                                       float *distances) {}
+
+}  // namespace zvec::turbo::scalar
\ No newline at end of file
diff --git a/src/turbo/scalar/float32/inner_product.h b/src/turbo/scalar/float32/inner_product.h
new file mode 100644
index 000000000..d4e03418e
--- /dev/null
+++ b/src/turbo/scalar/float32/inner_product.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/float32/squared_euclidean.cc b/src/turbo/scalar/float32/squared_euclidean.cc new file mode 100644 index 000000000..e89e01c18 --- /dev/null +++ b/src/turbo/scalar/float32/squared_euclidean.cc @@ -0,0 +1,26 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float32/squared_euclidean.h" + +namespace zvec::turbo::scalar { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) {} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) {} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/squared_euclidean.h b/src/turbo/scalar/float32/squared_euclidean.h new file mode 100644 index 000000000..bf319c1d2 --- /dev/null +++ b/src/turbo/scalar/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar From 4d21dd82fdf8583d8537d264b6f0c579b1d983c3 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 11:50:37 +0800 Subject: [PATCH 10/44] feat: add dist func --- src/include/zvec/turbo/turbo.h | 4 + src/turbo/avx/float32/common.h | 23 ++ src/turbo/avx/float32/cosine.cc | 49 ++++ src/turbo/{avx2 => avx}/float32/cosine.h | 4 +- .../{avx2 => avx}/float32/inner_product.cc | 0 .../{avx2 => avx}/float32/inner_product.h | 0 .../float32/squared_euclidean.cc | 18 +- .../{avx2 => avx}/float32/squared_euclidean.h | 4 +- src/turbo/avx2/float32/inner_product_common.h | 258 ------------------ .../record_quantized_int8/squared_euclidean.h | 2 +- src/turbo/avx512/float32/common.h | 11 - src/turbo/{avx2 => avx512}/float32/cosine.cc | 10 +- src/turbo/avx512/float32/cosine.h | 30 ++ src/turbo/avx512/float32/inner_product.cc | 53 ++++ src/turbo/avx512/float32/inner_product.h | 31 +++ src/turbo/avx512/float32/squared_euclidean.cc | 48 ++++ src/turbo/avx512/float32/squared_euclidean.h | 31 +++ .../scalar/record_quantized_int4/common.h | 23 ++ .../scalar/record_quantized_int4/cosine.cc | 37 +++ .../scalar/record_quantized_int4/cosine.h | 30 ++ .../record_quantized_int4/inner_product.cc | 41 +++ .../record_quantized_int4/inner_product.h | 31 +++ .../squared_euclidean.cc | 38 +++ .../record_quantized_int4/squared_euclidean.h | 31 +++ .../scalar/record_quantized_int8/common.h | 23 ++ .../scalar/record_quantized_int8/cosine.cc | 37 +++ .../scalar/record_quantized_int8/cosine.h | 30 ++ .../record_quantized_int8/inner_product.cc | 41 +++ .../record_quantized_int8/inner_product.h | 31 +++ .../squared_euclidean.cc | 38 +++ .../record_quantized_int8/squared_euclidean.h | 31 +++ src/turbo/turbo.cc | 111 ++++++++ tests/turbo/quantized_integer_test.cc | 184 +++++-------- 33 files changed, 922 insertions(+), 411 deletions(-) 
create mode 100644 src/turbo/avx/float32/common.h create mode 100644 src/turbo/avx/float32/cosine.cc rename src/turbo/{avx2 => avx}/float32/cosine.h (94%) rename src/turbo/{avx2 => avx}/float32/inner_product.cc (100%) rename src/turbo/{avx2 => avx}/float32/inner_product.h (100%) rename src/turbo/{avx2 => avx}/float32/squared_euclidean.cc (81%) rename src/turbo/{avx2 => avx}/float32/squared_euclidean.h (94%) delete mode 100644 src/turbo/avx2/float32/inner_product_common.h rename src/turbo/{avx2 => avx512}/float32/cosine.cc (87%) create mode 100644 src/turbo/avx512/float32/cosine.h create mode 100644 src/turbo/avx512/float32/inner_product.cc create mode 100644 src/turbo/avx512/float32/inner_product.h create mode 100644 src/turbo/avx512/float32/squared_euclidean.cc create mode 100644 src/turbo/avx512/float32/squared_euclidean.h create mode 100644 src/turbo/scalar/record_quantized_int4/common.h create mode 100644 src/turbo/scalar/record_quantized_int4/cosine.cc create mode 100644 src/turbo/scalar/record_quantized_int4/cosine.h create mode 100644 src/turbo/scalar/record_quantized_int4/inner_product.cc create mode 100644 src/turbo/scalar/record_quantized_int4/inner_product.h create mode 100644 src/turbo/scalar/record_quantized_int4/squared_euclidean.cc create mode 100644 src/turbo/scalar/record_quantized_int4/squared_euclidean.h create mode 100644 src/turbo/scalar/record_quantized_int8/common.h create mode 100644 src/turbo/scalar/record_quantized_int8/cosine.cc create mode 100644 src/turbo/scalar/record_quantized_int8/cosine.h create mode 100644 src/turbo/scalar/record_quantized_int8/inner_product.cc create mode 100644 src/turbo/scalar/record_quantized_int8/inner_product.h create mode 100644 src/turbo/scalar/record_quantized_int8/squared_euclidean.cc create mode 100644 src/turbo/scalar/record_quantized_int8/squared_euclidean.h diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h index 098067428..70ddabd6d 100644 --- 
a/src/include/zvec/turbo/turbo.h
+++ b/src/include/zvec/turbo/turbo.h
@@ -36,6 +36,8 @@ enum class MetricType {
 enum class DataType {
   kInt4,
   kInt8,
+  kFp16,
+  kFp32,
   kUnknown,
 };
 
@@ -45,7 +47,9 @@ enum class QuantizeType {
 
 enum class CpuArchType {
   kAuto,
+  kScalar,
   kSSE,
+  kAVX,
   kAVX2,
   kAVX512,
   kAVX512VNNI,
diff --git a/src/turbo/avx/float32/common.h b/src/turbo/avx/float32/common.h
new file mode 100644
index 000000000..13be3a2bf
--- /dev/null
+++ b/src/turbo/avx/float32/common.h
@@ -0,0 +1,23 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared AVX float32 kernels for the cosine, inner_product and
+// squared_euclidean distance implementations.
+//
+// All functions are marked always_inline so that when this header is included
+// from a per-file-march .cc translation unit, the compiler can fully inline
+// and optimize them under the correct -march flag without any cross-TU call
+// overhead.
+
+#pragma once
diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc
new file mode 100644
index 000000000..838e6f6ff
--- /dev/null
+++ b/src/turbo/avx/float32/cosine.cc
@@ -0,0 +1,49 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "avx/float32/cosine.h"
+#include "avx/float32/common.h"
+
+#if defined(__AVX__)
+#include 
+#endif
+
+namespace zvec::turbo::avx {
+
+void cosine_fp32_distance(const void *a, const void *b, size_t dim,
+                          float *distance) {
+#if defined(__AVX__)
+
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif  // __AVX__
+}
+
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query,
+                                size_t n, size_t dim, float *distances) {
+#if defined(__AVX__)
+
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif  //__AVX__
+}
+
+}  // namespace zvec::turbo::avx
\ No newline at end of file
diff --git a/src/turbo/avx2/float32/cosine.h b/src/turbo/avx/float32/cosine.h
similarity index 94%
rename from src/turbo/avx2/float32/cosine.h
rename to src/turbo/avx/float32/cosine.h
index 370724ddd..514a705e0 100644
--- a/src/turbo/avx2/float32/cosine.h
+++ b/src/turbo/avx/float32/cosine.h
@@ -16,7 +16,7 @@
 
 #include 
 
-namespace zvec::turbo::avx2 {
+namespace zvec::turbo::avx {
 
 // Compute cosine distance (negative inner product after normalization) between
 // a single quantized FP32 vector pair.
@@ -27,4 +27,4 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx2 \ No newline at end of file +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx2/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc similarity index 100% rename from src/turbo/avx2/float32/inner_product.cc rename to src/turbo/avx/float32/inner_product.cc diff --git a/src/turbo/avx2/float32/inner_product.h b/src/turbo/avx/float32/inner_product.h similarity index 100% rename from src/turbo/avx2/float32/inner_product.h rename to src/turbo/avx/float32/inner_product.h diff --git a/src/turbo/avx2/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc similarity index 81% rename from src/turbo/avx2/float32/squared_euclidean.cc rename to src/turbo/avx/float32/squared_euclidean.cc index 7900c827f..3bd1937d1 100644 --- a/src/turbo/avx2/float32/squared_euclidean.cc +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -12,37 +12,37 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx2/float32/squared_euclidean.h"
-#include "avx2/float32/inner_product_common.h"
+#include "avx/float32/squared_euclidean.h"
+#include "avx/float32/common.h"
 
-#if defined(__AVX2__)
+#if defined(__AVX__)
 #include 
 #endif
 
-namespace zvec::turbo::avx2 {
+namespace zvec::turbo::avx {
 
 void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim,
                                      float *distance) {
-#if defined(__AVX2__)
+#if defined(__AVX__)
 
 #else
   (void)a;
   (void)b;
   (void)dim;
   (void)distance;
-#endif  // __AVX2__
+#endif  // __AVX__
 }
 
 void squared_euclidean_fp32_batch_distance(const void *const *vectors,
                                            const void *query, size_t n,
                                            size_t dim, float *distances) {
-#if defined(__AVX2__)
+#if defined(__AVX__)
 
 #else
   (void)vectors;
   (void)query;
   (void)n;
   (void)dim;
   (void)distances;
-#endif  //__AVX2__
+#endif  //__AVX__
 }
 
-}  // namespace zvec::turbo::avx2
\ No newline at end of file
+}  // namespace zvec::turbo::avx
\ No newline at end of file
diff --git a/src/turbo/avx2/float32/squared_euclidean.h b/src/turbo/avx/float32/squared_euclidean.h
similarity index 94%
rename from src/turbo/avx2/float32/squared_euclidean.h
rename to src/turbo/avx/float32/squared_euclidean.h
index f2a1402cc..9e11f15bc 100644
--- a/src/turbo/avx2/float32/squared_euclidean.h
+++ b/src/turbo/avx/float32/squared_euclidean.h
@@ -16,7 +16,7 @@
 
 #include 
 
-namespace zvec::turbo::avx2 {
+namespace zvec::turbo::avx {
 
 // Compute squared euclidean distance between a single quantized FP32
 // vector pair.
@@ -28,4 +28,4 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx2 +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx2/float32/inner_product_common.h b/src/turbo/avx2/float32/inner_product_common.h deleted file mode 100644 index 6d12504e3..000000000 --- a/src/turbo/avx2/float32/inner_product_common.h +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - -#pragma once - -#if defined(__AVX2__) -#include -#include -#include -#include - -namespace zvec::turbo::avx2::internal { - - -/*! 
Four-bits Integer Multiplication Table - */ -static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, - 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, - 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, - 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, - 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, - 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, - 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, - 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, - 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, - 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, - 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, - 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, - 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, - 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, - 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, -}; - -//! 
Calculate Fused-Multiply-Add (GENERAL) -#define FMA_INT4_GENERAL(m, q, sum) \ - sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ - Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; - -static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { - __m256i x1 = _mm256_hadd_epi32(v, v); - __m256i x2 = _mm256_hadd_epi32(x1, x1); - __m128i x3 = _mm256_extractf128_si256(x2, 1); - __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); - return _mm_cvtsi128_si32(x4); -} - -#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) -#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) - -#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) -#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) - -static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, - 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; - -#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) - -#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) - -#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) - -//! 
Compute the distance between matrix and query -#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ - { \ - __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ - __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ - __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, \ - _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ - __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, \ - _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ - xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ - xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ - xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ - xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ - xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ - ONES_INT16_SSE); \ - xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ - ONES_INT16_SSE); \ - xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ - } - -#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ - { \ - __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ - __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ - __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, \ - _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ - __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, \ - _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ - ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ - ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ - ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ - ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ - ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ - ONES_INT16_AVX); \ - ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ - ONES_INT16_AVX); \ - ymm_sum = \ 
- _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ - } - -#if defined(__SSE2__) -static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { -#ifdef __SSE3__ - __m128i x1 = _mm_hadd_epi32(v, v); - __m128i x2 = _mm_hadd_epi32(x1, x1); - return _mm_cvtsi128_si32(x2); -#else - __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); - __m128i x2 = _mm_add_epi32(v, x1); - __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); - __m128i x4 = _mm_add_epi32(x2, x3); - return _mm_cvtsi128_si32(x4); -#endif -} -#endif // __SSE2__ - -//! Compute the distance between matrix and query -static __attribute__((always_inline)) void inner_product_int4_avx2( - const void *a, const void *b, size_t size, float *distance) { - const uint8_t *lhs = reinterpret_cast(a); - const uint8_t *rhs = reinterpret_cast(b); - const uint8_t *last = lhs + size; - const uint8_t *last_aligned = lhs + ((size >> 4) << 4); - __m128i xmm_sum = _mm_setzero_si128(); - - if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { - for (; lhs != last_aligned; lhs += 16, rhs += 16) { - __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); - __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); - FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) - } - } else { - for (; lhs != last_aligned; lhs += 16, rhs += 16) { - __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); - __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); - FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) - } - } - float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); - - switch (last - lhs) { - case 15: - FMA_INT4_GENERAL(lhs[14], rhs[14], result) - /* FALLTHRU */ - case 14: - FMA_INT4_GENERAL(lhs[13], rhs[13], result) - /* FALLTHRU */ - case 13: - FMA_INT4_GENERAL(lhs[12], rhs[12], result) - /* FALLTHRU */ - case 12: - FMA_INT4_GENERAL(lhs[11], rhs[11], result) - /* FALLTHRU */ - case 11: - FMA_INT4_GENERAL(lhs[10], rhs[10], result) - /* FALLTHRU */ - case 10: - FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) - /* FALLTHRU */ - case 9: - FMA_INT4_GENERAL(lhs[8], rhs[8], result) - /* FALLTHRU */ - case 8: - FMA_INT4_GENERAL(lhs[7], rhs[7], result) - /* FALLTHRU */ - case 7: - FMA_INT4_GENERAL(lhs[6], rhs[6], result) - /* FALLTHRU */ - case 6: - FMA_INT4_GENERAL(lhs[5], rhs[5], result) - /* FALLTHRU */ - case 5: - FMA_INT4_GENERAL(lhs[4], rhs[4], result) - /* FALLTHRU */ - case 4: - FMA_INT4_GENERAL(lhs[3], rhs[3], result) - /* FALLTHRU */ - case 3: - FMA_INT4_GENERAL(lhs[2], rhs[2], result) - /* FALLTHRU */ - case 2: - FMA_INT4_GENERAL(lhs[1], rhs[1], result) - /* FALLTHRU */ - case 1: - FMA_INT4_GENERAL(lhs[0], rhs[0], result) - } - - *distance = result; -} - -// Compute raw integer inner products for a batch of int8 vectors against a -// single query. Uses AVX512-VNNI dpbusd instruction. -// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. -template -__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( - const void *query, const void *const *vectors, - const std::array &prefetch_ptrs, - size_t dimensionality, float *distances) {} - -static __attribute__((always_inline)) void inner_product_int4_batch_avx2( - const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) { - static constexpr size_t batch_size = 2; - static constexpr size_t prefetch_step = 2; - size_t i = 0; - for (; i + batch_size <= n; i += batch_size) { - std::array prefetch_ptrs; - for (size_t j = 0; j < batch_size; ++j) { - if (i + j + batch_size * prefetch_step < n) { - prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; - } else { - prefetch_ptrs[j] = nullptr; - } - } - inner_product_int4_batch_avx2_impl( - query, &vectors[i], prefetch_ptrs, dim, distances + i); - } - for (; i < n; i++) { - std::array prefetch_ptrs{nullptr}; - inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, - dim, distances + i); - } -} - -} // namespace zvec::turbo::avx2::internal - -#endif // defined(__AVX2__) diff 
--git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h index 40d8a1baf..1bbfa6676 100644 --- a/src/turbo/avx2/record_quantized_int8/squared_euclidean.h +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.h @@ -23,7 +23,7 @@ namespace zvec::turbo::avx2 { void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, float *distance); -// Batch version of squared euclidean INT4. +// Batch version of squared euclidean INT8. void squared_euclidean_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); diff --git a/src/turbo/avx512/float32/common.h b/src/turbo/avx512/float32/common.h index 35dbf1f08..13be3a2bf 100644 --- a/src/turbo/avx512/float32/common.h +++ b/src/turbo/avx512/float32/common.h @@ -21,14 +21,3 @@ // overhead. #pragma once - -#if defined(__AVX512VNNI__) -#include -#include -#include - -namespace zvec::turbo::avx512_vnni::internal { - -} // namespace zvec::turbo::avx512_vnni::internal - -#endif // defined(__AVX512VNNI__) diff --git a/src/turbo/avx2/float32/cosine.cc b/src/turbo/avx512/float32/cosine.cc similarity index 87% rename from src/turbo/avx2/float32/cosine.cc rename to src/turbo/avx512/float32/cosine.cc index 0b77c170b..9eb6b5b00 100644 --- a/src/turbo/avx2/float32/cosine.cc +++ b/src/turbo/avx512/float32/cosine.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx2/float32/cosine.h" -#include "avx2/float32/inner_product_common.h" +#include "avx512/float32/cosine.h" +#include "avx512/float32/common.h" -#if defined(__AVX2__) +#if defined(__AVX512__) #include #endif -namespace zvec::turbo::avx2 { +namespace zvec::turbo::avx512 { void cosine_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { @@ -46,4 +46,4 @@ void cosine_fp32_batch_distance(const void *const *vectors, const void *query, #endif //__AVX2__ } -} // namespace zvec::turbo::avx2 \ No newline at end of file +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/cosine.h b/src/turbo/avx512/float32/cosine.h new file mode 100644 index 000000000..7e11de89f --- /dev/null +++ b/src/turbo/avx512/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query,
+                                size_t n, size_t dim, float *distances);
+
+} // namespace zvec::turbo::avx512
\ No newline at end of file
diff --git a/src/turbo/avx512/float32/inner_product.cc b/src/turbo/avx512/float32/inner_product.cc
new file mode 100644
index 000000000..f9086f11b
--- /dev/null
+++ b/src/turbo/avx512/float32/inner_product.cc
@@ -0,0 +1,53 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "avx512/float32/inner_product.h"
+#include "avx512/float32/common.h"
+
+#if defined(__AVX512F__)
+#include 
+#endif
+
+namespace zvec::turbo::avx512 {
+
+// Compute inner product distance between a single quantized FP32
+// vector pair.
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim,
+                                 float *distance) {
+#if defined(__AVX512F__)
+
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif //__AVX512F__
+}
+
+// Batch version of inner_product_fp32_distance.
+void inner_product_fp32_batch_distance(const void *const *vectors,
+                                       const void *query, size_t n, size_t dim,
+                                       float *distances) {
+#if defined(__AVX512F__)
+
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif //__AVX512F__
+}
+
+} // namespace zvec::turbo::avx512
\ No newline at end of file
diff --git a/src/turbo/avx512/float32/inner_product.h b/src/turbo/avx512/float32/inner_product.h
new file mode 100644
index 000000000..d1f48eecf
--- /dev/null
+++ b/src/turbo/avx512/float32/inner_product.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+
+namespace zvec::turbo::avx512 {
+
+// Compute inner product distance between a single quantized FP32
+// vector pair.
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim,
+                                 float *distance);
+
+// Batch version of inner_product_fp32_distance.
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/float32/squared_euclidean.cc b/src/turbo/avx512/float32/squared_euclidean.cc new file mode 100644 index 000000000..9a21ced80 --- /dev/null +++ b/src/turbo/avx512/float32/squared_euclidean.cc @@ -0,0 +1,48 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "avx512/float32/squared_euclidean.h"
+#include "avx512/float32/common.h"
+
+#if defined(__AVX512F__)
+#include 
+#endif
+
+namespace zvec::turbo::avx512 {
+
+void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim,
+                                     float *distance) {
+#if defined(__AVX512F__)
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif // __AVX512F__
+}
+
+void squared_euclidean_fp32_batch_distance(const void *const *vectors,
+                                           const void *query, size_t n,
+                                           size_t dim, float *distances) {
+#if defined(__AVX512F__)
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif //__AVX512F__
+}
+
+} // namespace zvec::turbo::avx512
\ No newline at end of file
diff --git a/src/turbo/avx512/float32/squared_euclidean.h b/src/turbo/avx512/float32/squared_euclidean.h
new file mode 100644
index 000000000..8b43b540e
--- /dev/null
+++ b/src/turbo/avx512/float32/squared_euclidean.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+
+namespace zvec::turbo::avx512 {
+
+// Compute squared euclidean distance between a single quantized FP32
+// vector pair.
+void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim,
+                                     float *distance);
+
+// Batch version of squared euclidean FP32.
+void squared_euclidean_fp32_batch_distance(const void *const *vectors,
+                                           const void *query, size_t n,
+                                           size_t dim, float *distances);
+
+} // namespace zvec::turbo::avx512
diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h
new file mode 100644
index 000000000..13be3a2bf
--- /dev/null
+++ b/src/turbo/scalar/record_quantized_int4/common.h
@@ -0,0 +1,23 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared scalar inner product kernels for record_quantized_int4 distance
+// implementations (cosine, l2, mips_l2, etc.).
+//
+// All functions are marked always_inline so that when this header is included
+// from a per-file-march .cc translation unit, the compiler can fully inline
+// and optimize them under the correct -march flag without any cross-TU call
+// overhead.
+
+#pragma once
diff --git a/src/turbo/scalar/record_quantized_int4/cosine.cc b/src/turbo/scalar/record_quantized_int4/cosine.cc
new file mode 100644
index 000000000..ad6105d31
--- /dev/null
+++ b/src/turbo/scalar/record_quantized_int4/cosine.cc
@@ -0,0 +1,37 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int4/cosine.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/cosine.h b/src/turbo/scalar/record_quantized_int4/cosine.h new file mode 100644 index 000000000..25838aa02 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int4 vector pair. +void cosine_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int4_distance. +void cosine_int4_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.cc b/src/turbo/scalar/record_quantized_int4/inner_product.cc new file mode 100644 index 000000000..f3e183f20 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/inner_product.cc @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int4/inner_product.h" +#include "scalar/record_quantized_int4/common.h" + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized int4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.h b/src/turbo/scalar/record_quantized_int4/inner_product.h new file mode 100644 index 000000000..b34d47aa4 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized int4 +// vector pair. +void inner_product_int4_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int4_distance. 
+void inner_product_int4_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc new file mode 100644 index 000000000..555cc85a5 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc @@ -0,0 +1,38 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "scalar/record_quantized_int4/squared_euclidean.h"
+#include "scalar/record_quantized_int4/common.h"
+
+namespace zvec::turbo::scalar {
+
+void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim,
+                                     float *distance) {
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+}
+
+void squared_euclidean_int4_batch_distance(const void *const *vectors,
+                                           const void *query, size_t n,
+                                           size_t dim, float *distances) {
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+}
+
+} // namespace zvec::turbo::scalar
\ No newline at end of file
diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.h b/src/turbo/scalar/record_quantized_int4/squared_euclidean.h
new file mode 100644
index 000000000..ea37cfdec
--- /dev/null
+++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+
+namespace zvec::turbo::scalar {
+
+// Compute squared euclidean distance between a single quantized int4
+// vector pair.
+void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim,
+                                     float *distance);
+
+// Batch version of squared euclidean INT4.
+void squared_euclidean_int4_batch_distance(const void *const *vectors,
+                                           const void *query, size_t n,
+                                           size_t dim, float *distances);
+
+} // namespace zvec::turbo::scalar
diff --git a/src/turbo/scalar/record_quantized_int8/common.h b/src/turbo/scalar/record_quantized_int8/common.h
new file mode 100644
index 000000000..13be3a2bf
--- /dev/null
+++ b/src/turbo/scalar/record_quantized_int8/common.h
@@ -0,0 +1,23 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared scalar inner product kernels for record_quantized_int8 distance
+// implementations (cosine, l2, mips_l2, etc.).
+//
+// All functions are marked always_inline so that when this header is included
+// from a per-file-march .cc translation unit, the compiler can fully inline
+// and optimize them under the correct -march flag without any cross-TU call
+// overhead.
+
+#pragma once
diff --git a/src/turbo/scalar/record_quantized_int8/cosine.cc b/src/turbo/scalar/record_quantized_int8/cosine.cc
new file mode 100644
index 000000000..221068437
--- /dev/null
+++ b/src/turbo/scalar/record_quantized_int8/cosine.cc
@@ -0,0 +1,37 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int8/cosine.h" +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/cosine.h b/src/turbo/scalar/record_quantized_int8/cosine.h new file mode 100644 index 000000000..e06d8b234 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized int8 vector pair. +void cosine_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_int8_distance. +void cosine_int8_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.cc b/src/turbo/scalar/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..1927d97dd --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/inner_product.cc @@ -0,0 +1,41 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/record_quantized_int8/inner_product.h" +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +// Compute squared Euclidean distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +// Batch version of inner_product_int8_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.h b/src/turbo/scalar/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..1ed51489a --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. 
+void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc new file mode 100644 index 000000000..aa8b7be66 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc @@ -0,0 +1,38 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/record_quantized_int8/squared_euclidean.h" +#include "scalar/record_quantized_int8/common.h" + +namespace zvec::turbo::scalar { + +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.h b/src/turbo/scalar/record_quantized_int8/squared_euclidean.h new file mode 100644 index 000000000..07db60519 --- /dev/null +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized INT8 +// vector pair. +void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean INT8. 
+void squared_euclidean_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index d135d2fe0..8bd3ac068 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -22,6 +22,12 @@ #include "avx2/record_quantized_int8/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "scalar/record_quantized_int4/cosine.h" +#include "scalar/record_quantized_int4/inner_product.h" +#include "scalar/record_quantized_int4/squared_euclidean.h" +#include "scalar/record_quantized_int8/cosine.h" +#include "scalar/record_quantized_int8/inner_product.h" +#include "scalar/record_quantized_int8/squared_euclidean.h" #include "sse/record_quantized_int4/cosine.h" #include "sse/record_quantized_int4/inner_product.h" #include "sse/record_quantized_int4/squared_euclidean.h" @@ -77,6 +83,17 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, return sse::inner_product_int8_distance; } } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int8_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int8_distance; + } + + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int8_distance; + } } } @@ -96,9 +113,93 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, return avx2::inner_product_int4_distance; } } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kSSE)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return sse::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return sse::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return 
sse::inner_product_int4_distance; + } + } + + // if (metric_type == MetricType::kSquaredEuclidean) { + // return scalar::squared_euclidean_int4_distance; + // } + // else if (metric_type == MetricType::kCosine) { + // return scalar::cosine_int4_distance; + // } + // else if (metric_type == MetricType::kInnerProduct) { + // return scalar::inner_product_int4_distance; + // } + } + } + + // FP32 + if (data_type == DataType::kFp32) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return avx512::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx512::inner_product_fp32_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return avx::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return avx::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx::inner_product_fp32_distance; + } + } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_fp32_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_fp32_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_fp32_distance; + } } + } + // FP16 + if (data_type == DataType::kFp16) { if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX2)) { + if (metric_type == MetricType::kSquaredEuclidean) { + return 
avx2::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return avx2::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx2::inner_product_int4_distance; + } + } + if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && (cpu_arch_type == CpuArchType::kAuto || cpu_arch_type == CpuArchType::kSSE)) { @@ -112,6 +213,16 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, return sse::inner_product_int4_distance; } } + + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } } } return nullptr; diff --git a/tests/turbo/quantized_integer_test.cc b/tests/turbo/quantized_integer_test.cc index 9a7ecac23..94167557c 100644 --- a/tests/turbo/quantized_integer_test.cc +++ b/tests/turbo/quantized_integer_test.cc @@ -40,6 +40,9 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + auto func_float = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); auto func_avx2 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, @@ -49,6 +52,10 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + ailego::NumericalVector query_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { query_vec[j] = dist(gen); @@ -77,159 +84,90 @@ 
TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { float score_float = ailego::Distance::MinusInnerProduct( query_vec.data(), doc_vec.data(), DIMENSION); + func_float(query_vec.data(), doc_vec.data(), DIMENSION, &score_float); + + float score_scalar{0.0f}; float score_avx2{0.0f}; float score_sse{0.0f}; + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_avx2); + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_sse); ASSERT_NEAR(score_float, score_avx2, 0.2 * DIMENSION); ASSERT_NEAR(score_float, score_sse, 0.2 * DIMENSION); - ASSERT_NEAR(score_avx2, score_sse, 0.001); + ASSERT_NEAR(score_float, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); } } -#if 0 TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); - const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); const size_t COUNT = 1000; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP32, DIMENSION); - meta.set_metric("InnerProduct", 0, Params()); - auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); - auto holder = GetHolder(DIMENSION, COUNT, dist); - ASSERT_EQ(0u, IndexConverter::TrainAndTransform(converter, holder)); - auto holder2 = converter->result(); - EXPECT_EQ(COUNT, holder2->count()); - EXPECT_EQ(IndexMeta::DT_INT4, holder2->data_type()); - auto &meta2 = converter->meta(); - auto 
reformer = IndexFactory::CreateReformer(meta2.reformer_name()); - ASSERT_TRUE(reformer); - ASSERT_EQ(0u, reformer->init(meta2.reformer_params())); + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); - ailego::NumericalVector vec(DIMENSION); + ailego::NumericalVector query_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { - vec[j] = dist(gen); - } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta2; - std::string out; - ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &out, &qmeta2)); - ASSERT_EQ(qmeta2.dimension(), meta2.dimension()); - - auto iter = holder->create_iterator(); - auto iter2 = holder2->create_iterator(); - auto metric = IndexFactory::CreateMetric(meta2.metric_name()); - ASSERT_TRUE(!!metric); - ASSERT_EQ(0, metric->init(meta2, meta2.metric_params())); - auto compute = metric->distance(); - ASSERT_TRUE(compute); - - for (; iter->is_valid(); iter->next(), iter2->next()) { - const float *mf = (const float *)iter->data(); - const int8_t *mi = (const int8_t *)iter2->data(); - const int8_t *qi = reinterpret_cast(&out[0]); - float v1 = ailego::Distance::MinusInnerProduct(mf, vec.data(), - holder->dimension()); - float v2; - compute(mi, qi, holder2->dimension(), &v2); - ASSERT_NEAR(v1, v2, 0.2 * DIMENSION); - - std::string out2; - ASSERT_EQ(0, reformer->convert(iter->data(), qmeta, &out2, &qmeta2)); - ASSERT_EQ(out2.size(), holder2->element_size()); - ASSERT_EQ(0, std::memcmp(out2.data(), iter2->data(), out2.size())); + query_vec[j] = dist(gen); } -} -TEST(QuantizedIntegerMetric, TestInt8Cosine) { - std::mt19937 gen(15583); - std::uniform_real_distribution dist(-1.0, 2.0); + for (size_t i = 0; i < COUNT; ++i) { + 
ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } - const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; - IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); - meta.set_metric("Cosine", 0, Params()); - auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); - ASSERT_TRUE(!!converter); - Params converter_params; - ASSERT_EQ(0u, converter->init(meta, converter_params)); + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; - auto holder = GetHolder(DIMENSION, COUNT, dist); - ASSERT_EQ(0u, IndexConverter::TrainAndTransform(converter, holder)); - auto holder2 = converter->result(); - EXPECT_EQ(COUNT, holder2->count()); - EXPECT_EQ(IndexMeta::DT_INT8, holder2->data_type()); - auto &meta2 = converter->meta(); + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - auto reformer = IndexFactory::CreateReformer(meta2.reformer_name()); - ASSERT_TRUE(reformer); - ASSERT_EQ(0u, reformer->init(meta2.reformer_params())); + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - ailego::NumericalVector vec(DIMENSION); - for (size_t j = 0; j < DIMENSION; ++j) { - vec[j] = dist(gen); - } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta2; - std::string out; - ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &out, &qmeta2)); - ASSERT_EQ(qmeta2.dimension(), meta2.dimension()); - - auto iter = holder->create_iterator(); - auto iter2 = holder2->create_iterator(); - auto metric = IndexFactory::CreateMetric(meta2.metric_name()); - ASSERT_TRUE(!!metric); - ASSERT_EQ(0, metric->init(meta2, meta2.metric_params())); - auto 
compute_batch = metric->batch_distance(); - ASSERT_TRUE(compute_batch); - - int8_t *qi = reinterpret_cast(&out[0]); - if (auto query_preprocess_func = metric->get_query_preprocess_func(); - query_preprocess_func != nullptr) { - query_preprocess_func(qi, holder2->dimension()); - } + float score_float = ailego::Distance::MinusInnerProduct( + query_vec.data(), doc_vec.data(), DIMENSION); + + float score_avx2{0.0f}; + float score_sse{0.0f}; - for (; iter->is_valid(); iter->next(), iter2->next()) { - const float *mf = (const float *)iter->data(); - const int8_t *mi = (const int8_t *)iter2->data(); - - // normalize mf & vec - std::vector normalized_mf(DIMENSION); - memcpy(normalized_mf.data(), mf, DIMENSION * sizeof(float)); - float norm_mf = 0.0; - ailego::Normalizer::L2((float *)normalized_mf.data(), DIMENSION, - &norm_mf); - std::vector normalized_vec(DIMENSION); - memcpy(normalized_vec.data(), vec.data(), DIMENSION * sizeof(float)); - float norm_vec = 0.0; - ailego::Normalizer::L2((float *)normalized_vec.data(), DIMENSION, - &norm_vec); - - float v1 = ailego::Distance::MinusInnerProduct( - normalized_mf.data(), normalized_vec.data(), holder->dimension()); - float v2; - compute_batch(reinterpret_cast(&mi), qi, 1, - holder2->dimension(), &v2); - // printf("%f %f\n", v1, v2); - ASSERT_NEAR(v1, v2, 0.2 * DIMENSION); - - std::string out2; - ASSERT_EQ(0, reformer->convert(iter->data(), qmeta, &out2, &qmeta2)); - ASSERT_EQ(out2.size(), holder2->element_size()); - ASSERT_EQ(0, std::memcmp(out2.data(), iter2->data(), out2.size())); + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_avx2, score_sse, 0.001); } } - -#endif \ No newline at end of file From 42dd2999e80f319021730649d4e5fbcfd94b2c78 Mon Sep 17 00:00:00 2001 
From: ray Date: Tue, 31 Mar 2026 14:45:36 +0800 Subject: [PATCH 11/44] feat: add scalar dist funcs --- src/turbo/avx/float32/cosine.cc | 2 +- src/turbo/avx/float32/inner_product.cc | 18 +++++------------- src/turbo/avx/float32/inner_product.h | 4 ++-- src/turbo/avx/float32/squared_euclidean.cc | 3 ++- src/turbo/scalar/float32/cosine.cc | 11 ++++++++++- src/turbo/scalar/float32/inner_product.cc | 12 +++++++++++- src/turbo/scalar/float32/squared_euclidean.cc | 13 ++++++++++++- src/turbo/turbo.cc | 9 +++++++++ 8 files changed, 52 insertions(+), 20 deletions(-) diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc index 838e6f6ff..76791ad8a 100644 --- a/src/turbo/avx/float32/cosine.cc +++ b/src/turbo/avx/float32/cosine.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "avx/float32/cosine.h" -#include "avx/float32/inner_product_common.h" +#include "avx/float32/common.h" #if defined(__AVX__) #include diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc index bf8d5290a..5e34f0bb6 100644 --- a/src/turbo/avx/float32/inner_product.cc +++ b/src/turbo/avx/float32/inner_product.cc @@ -12,42 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx2/record_quantized_int4/inner_product.h" -#include "avx2/record_quantized_int4/inner_product_common.h" +#include "avx/float32/inner_product.h" +#include "avx/float32/common.h" -#if defined(__AVX2__) +#if defined(__AVX__) #include #endif -namespace zvec::turbo::avx2 { +namespace zvec::turbo::avx { // Compute squared Euclidean distance between a single quantized FP32 // vector pair. void inner_product_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX2__) - -#else (void)a; (void)b; (void)dim; (void)distance; -#endif //__AVX2__ } // Batch version of inner_product_fp32_distance. 
void inner_product_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX2__) - -#else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; -#endif //__AVX2__ } -} // namespace zvec::turbo::avx2 \ No newline at end of file +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/inner_product.h b/src/turbo/avx/float32/inner_product.h index a98659a26..083a35f6f 100644 --- a/src/turbo/avx/float32/inner_product.h +++ b/src/turbo/avx/float32/inner_product.h @@ -16,7 +16,7 @@ #include -namespace zvec::turbo::avx2 { +namespace zvec::turbo::avx { // Compute inner product distance between a single quantized FP32 // vector pair. @@ -28,4 +28,4 @@ void inner_product_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx2 +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc index 3bd1937d1..710738d24 100644 --- a/src/turbo/avx/float32/squared_euclidean.cc +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "avx/float32/squared_euclidean.h" -#include "avx/float32/inner_product_common.h" +#include "avx/float32/common.h" #if defined(__AVX__) #include @@ -24,6 +24,7 @@ namespace zvec::turbo::avx { void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) + #else (void)a; (void)b; diff --git a/src/turbo/scalar/float32/cosine.cc b/src/turbo/scalar/float32/cosine.cc index f4d1db6e8..21c7938d7 100644 --- a/src/turbo/scalar/float32/cosine.cc +++ b/src/turbo/scalar/float32/cosine.cc @@ -13,11 +13,20 @@ // limitations under the License. 
#include "scalar/float32/cosine.h" +#include "scalar/float32/inner_product.h" namespace zvec::turbo::scalar { void cosine_fp32_distance(const void *a, const void *b, size_t dim, - float *distance) {} + float *distance) { + constexpr size_t extra_dim = 1; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +} void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) {} diff --git a/src/turbo/scalar/float32/inner_product.cc b/src/turbo/scalar/float32/inner_product.cc index 5dd945b7a..65f63bb36 100644 --- a/src/turbo/scalar/float32/inner_product.cc +++ b/src/turbo/scalar/float32/inner_product.cc @@ -19,7 +19,17 @@ namespace zvec::turbo::scalar { // Compute squared Euclidean distance between a single quantized FP32 // vector pair. void inner_product_fp32_distance(const void *a, const void *b, size_t dim, - float *distance) {} + float *distance) { + const float *m = reinterpret_cast(a); + const float *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += static_cast(m[i] * q[i]); + } + + *distance = -sum; +} // Batch version of inner_product_fp32_distance. void inner_product_fp32_batch_distance(const void *const *vectors, diff --git a/src/turbo/scalar/float32/squared_euclidean.cc b/src/turbo/scalar/float32/squared_euclidean.cc index e89e01c18..f69c42e4d 100644 --- a/src/turbo/scalar/float32/squared_euclidean.cc +++ b/src/turbo/scalar/float32/squared_euclidean.cc @@ -13,11 +13,22 @@ // limitations under the License. 
#include "scalar/float32/squared_euclidean.h" +#include namespace zvec::turbo::scalar { void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, - float *distance) {} + float *distance) { + const float *m = reinterpret_cast(a); + const float *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += zvec::ailego::MathHelper::SquaredDifference(m[i], q[i]); + } + + *distance = sum; +} void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 8bd3ac068..748b840d2 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -14,14 +14,23 @@ #include #include +#include "avx/float32/cosine.h" +#include "avx/float32/inner_product.h" +#include "avx/float32/squared_euclidean.h" #include "avx2/record_quantized_int4/cosine.h" #include "avx2/record_quantized_int4/inner_product.h" #include "avx2/record_quantized_int4/squared_euclidean.h" #include "avx2/record_quantized_int8/cosine.h" #include "avx2/record_quantized_int8/inner_product.h" #include "avx2/record_quantized_int8/squared_euclidean.h" +#include "avx512/float32/cosine.h" +#include "avx512/float32/inner_product.h" +#include "avx512/float32/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "scalar/float32/cosine.h" +#include "scalar/float32/inner_product.h" +#include "scalar/float32/squared_euclidean.h" #include "scalar/record_quantized_int4/cosine.h" #include "scalar/record_quantized_int4/inner_product.h" #include "scalar/record_quantized_int4/squared_euclidean.h" From 04d86ff0f417a9075644a260aed304cce8bd6b5f Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 14:45:52 +0800 Subject: [PATCH 12/44] feat: add scalar dist funcs --- src/turbo/scalar/float16/cosine.cc | 34 +++++++++++++++ src/turbo/scalar/float16/cosine.h | 30 +++++++++++++ 
src/turbo/scalar/float16/inner_product.cc | 42 +++++++++++++++++++ src/turbo/scalar/float16/inner_product.h | 31 ++++++++++++++ src/turbo/scalar/float16/squared_euclidean.cc | 39 +++++++++++++++++ src/turbo/scalar/float16/squared_euclidean.h | 31 ++++++++++++++ 6 files changed, 207 insertions(+) create mode 100644 src/turbo/scalar/float16/cosine.cc create mode 100644 src/turbo/scalar/float16/cosine.h create mode 100644 src/turbo/scalar/float16/inner_product.cc create mode 100644 src/turbo/scalar/float16/inner_product.h create mode 100644 src/turbo/scalar/float16/squared_euclidean.cc create mode 100644 src/turbo/scalar/float16/squared_euclidean.h diff --git a/src/turbo/scalar/float16/cosine.cc b/src/turbo/scalar/float16/cosine.cc new file mode 100644 index 000000000..4999cc8c2 --- /dev/null +++ b/src/turbo/scalar/float16/cosine.cc @@ -0,0 +1,34 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "scalar/float16/cosine.h" +#include "scalar/float16/inner_product.h" + +namespace zvec::turbo::scalar { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) {} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float16/cosine.h b/src/turbo/scalar/float16/cosine.h new file mode 100644 index 000000000..cb82bc893 --- /dev/null +++ b/src/turbo/scalar/float16/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP16 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp16_distance. 
+void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float16/inner_product.cc b/src/turbo/scalar/float16/inner_product.cc new file mode 100644 index 000000000..e968a6c31 --- /dev/null +++ b/src/turbo/scalar/float16/inner_product.cc @@ -0,0 +1,42 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/float16/inner_product.h" +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + const zvec::ailego::Float16 *m = + reinterpret_cast(a); + const zvec::ailego::Float16 *q = + reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += static_cast(m[i] * q[i]); + } + + *distance = -sum; +} + +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) {} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float16/inner_product.h b/src/turbo/scalar/float16/inner_product.h new file mode 100644 index 000000000..98fc4cba4 --- /dev/null +++ b/src/turbo/scalar/float16/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp16_distance. +void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::scalar diff --git a/src/turbo/scalar/float16/squared_euclidean.cc b/src/turbo/scalar/float16/squared_euclidean.cc new file mode 100644 index 000000000..53d46c0a1 --- /dev/null +++ b/src/turbo/scalar/float16/squared_euclidean.cc @@ -0,0 +1,39 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "scalar/float16/squared_euclidean.h" +#include + +namespace zvec::turbo::scalar { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + const zvec::ailego::Float16 *m = + reinterpret_cast(a); + const zvec::ailego::Float16 *q = + reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < dim; ++i) { + sum += zvec::ailego::MathHelper::SquaredDifference(m[i], q[i]); + } + + *distance = sum; +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) {} + +} // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float16/squared_euclidean.h b/src/turbo/scalar/float16/squared_euclidean.h new file mode 100644 index 000000000..8865cd1c2 --- /dev/null +++ b/src/turbo/scalar/float16/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::scalar { + +// Compute squared euclidean distance between a single quantized FP16 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP16. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::scalar From 1958a828caeb7f4a04e3fa0713e3a2db359b9337 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 15:30:07 +0800 Subject: [PATCH 13/44] feat: add ut --- src/turbo/avx512/float32/inner_product.cc | 48 ++ .../scalar/record_quantized_int8/cosine.cc | 28 +- tests/turbo/turbo_cosine_test.cc | 608 ++++++++++++++++++ tests/turbo/turbo_euclidean_test.cc | 145 +++++ tests/turbo/turbo_inner_product_test.cc | 80 +++ ...ger_test.cc => turbo_quantized_integer.cc} | 12 +- 6 files changed, 911 insertions(+), 10 deletions(-) create mode 100644 tests/turbo/turbo_cosine_test.cc create mode 100644 tests/turbo/turbo_euclidean_test.cc create mode 100644 tests/turbo/turbo_inner_product_test.cc rename tests/turbo/{quantized_integer_test.cc => turbo_quantized_integer.cc} (94%) diff --git a/src/turbo/avx512/float32/inner_product.cc b/src/turbo/avx512/float32/inner_product.cc index f9086f11b..84264127a 100644 --- a/src/turbo/avx512/float32/inner_product.cc +++ b/src/turbo/avx512/float32/inner_product.cc @@ -26,6 +26,54 @@ namespace zvec::turbo::avx512 { void inner_product_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX512__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); + + __m512 zmm_sum_0 = _mm512_setzero_ps(); + __m512 zmm_sum_1 = _mm512_setzero_ps(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != 
last_aligned; lhs += 32, rhs += 32) { + FMA_FP32_AVX512(_mm512_load_ps(lhs + 0), _mm512_load_ps(rhs + 0), + zmm_sum_0) + + FMA_FP32_AVX512(_mm512_load_ps(lhs + 16), _mm512_load_ps(rhs + 16), + zmm_sum_1) + } + + if (last >= last_aligned + 16) { + FMA_FP32_AVX512(_mm512_load_ps(lhs), _mm512_load_ps(rhs), zmm_sum_0) + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + FMA_FP32_AVX512(_mm512_loadu_ps(lhs + 0), _mm512_loadu_ps(rhs + 0), + zmm_sum_0) + + FMA_FP32_AVX512(_mm512_loadu_ps(lhs + 16), _mm512_loadu_ps(rhs + 16), + zmm_sum_1) + } + + if (last >= last_aligned + 16) { + FMA_FP32_AVX512(_mm512_loadu_ps(lhs), _mm512_loadu_ps(rhs), zmm_sum_0) + lhs += 16; + rhs += 16; + } + } + + zmm_sum_0 = _mm512_add_ps(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask16 mask = (__mmask16)((1 << (last - lhs)) - 1); + __m512 zmm_undefined = _mm512_undefined_ps(); + zmm_sum_0 = _mm512_mask3_fmadd_ps( + _mm512_mask_loadu_ps(zmm_undefined, mask, lhs), + _mm512_mask_loadu_ps(zmm_undefined, mask, rhs), zmm_sum_0, mask); + } + *distance = -HorizontalAdd_FP32_V512(zmm_sum_0); #else (void)a; diff --git a/src/turbo/scalar/record_quantized_int8/cosine.cc b/src/turbo/scalar/record_quantized_int8/cosine.cc index 221068437..c42e0b7b1 100644 --- a/src/turbo/scalar/record_quantized_int8/cosine.cc +++ b/src/turbo/scalar/record_quantized_int8/cosine.cc @@ -13,16 +13,36 @@ // limitations under the License. 
#include "scalar/record_quantized_int8/cosine.h" +#include #include "scalar/record_quantized_int8/common.h" namespace zvec::turbo::scalar { void cosine_int8_distance(const void *a, const void *b, size_t dim, float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; + const size_t original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + // internal::inner_product_int8_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); } void cosine_int8_batch_distance(const void *const *vectors, const void *query, diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc new file mode 100644 index 000000000..ce7ce94d0 --- /dev/null +++ b/tests/turbo/turbo_cosine_test.cc @@ -0,0 +1,608 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; +using namespace zvec::ailego; + +#if 0 +static void Norm2(std::vector &vec, std::string *out) { + float norm = 0.0f; + + out->resize(vec.size() * sizeof(Float16) + sizeof(float)); + + Norm2Matrix::Compute(vec.data(), vec.size(), &norm); + + Float16 *buf = reinterpret_cast(&(*out)[0]); + + for (uint32_t i = 0; i < vec.size(); ++i) { + buf[i] = vec[i] / norm; + } + + float *norm_buf = + reinterpret_cast(&(*out)[vec.size() * sizeof(Float16)]); + + memcpy(norm_buf, &norm, sizeof(float)); +} + +static void Norm2(std::vector &vec, std::string *out) { + float norm = 0.0f; + + out->resize((vec.size() + 1) * sizeof(float)); + + Norm2Matrix::Compute(vec.data(), vec.size(), &norm); + + float *buf = reinterpret_cast(&(*out)[0]); + for (uint32_t i = 0; i < vec.size(); ++i) { + buf[i] = vec[i] / norm; + } + + buf[vec.size()] = norm; +} + +static size_t ExtraDimension(IndexMeta::DataType type) { + // The extra quantized params storage size to save for each vector + if (type == IndexMeta::DT_FP32) return 1; + if (type == IndexMeta::DT_FP16) return 2; + + return 0; +} + +TEST(CosineMeasure_General_Test, General) { + auto measure = IndexFactory::CreateMetric("Cosine"); + EXPECT_TRUE(measure); + + IndexMeta meta; + meta.set_meta(IndexMeta::DT_INT16, 64); + ASSERT_NE(0, measure->init(meta, Params())); + meta.set_meta(IndexMeta::DT_FP16, 64); + ASSERT_EQ(0, measure->init(meta, Params())); + meta.set_meta(IndexMeta::DT_FP32, 64); + ASSERT_EQ(0, measure->init(meta, Params())); + meta.set_meta(IndexMeta::DT_INT8, 64); + ASSERT_NE(0, measure->init(meta, Params())); + + meta.set_meta(IndexMeta::DT_BINARY32, 64); + ASSERT_NE(0, measure->init(meta, Params())); + meta.set_meta(IndexMeta::DT_BINARY64, 64); + ASSERT_NE(0, measure->init(meta, Params())); + meta.set_meta(IndexMeta::DT_INT4, 64); + ASSERT_NE(0, measure->init(meta, Params())); + + IndexMeta 
meta2; + meta2.set_meta(IndexMeta::DT_BINARY32, 64); + EXPECT_FALSE(measure->is_matched(meta2)); + EXPECT_TRUE( + measure->is_matched(meta, IndexQueryMeta(IndexMeta::DT_FP32, 64))); + EXPECT_FALSE( + measure->is_matched(meta, IndexQueryMeta(IndexMeta::DT_FP32, 63))); + + EXPECT_FALSE(measure->distance_matrix(0, 0)); + EXPECT_FALSE(measure->distance_matrix(3, 5)); + EXPECT_FALSE(measure->distance_matrix(31, 65)); + EXPECT_TRUE(measure->distance_matrix(1, 1)); + EXPECT_FALSE(measure->distance_matrix(2, 1)); + EXPECT_FALSE(measure->distance_matrix(2, 2)); + EXPECT_FALSE(measure->distance_matrix(4, 1)); + EXPECT_FALSE(measure->distance_matrix(4, 2)); + EXPECT_FALSE(measure->distance_matrix(4, 4)); + EXPECT_FALSE(measure->distance_matrix(8, 1)); + EXPECT_FALSE(measure->distance_matrix(8, 2)); + EXPECT_FALSE(measure->distance_matrix(8, 4)); + EXPECT_FALSE(measure->distance_matrix(8, 8)); + EXPECT_FALSE(measure->distance_matrix(16, 1)); + EXPECT_FALSE(measure->distance_matrix(16, 2)); + EXPECT_FALSE(measure->distance_matrix(16, 4)); + EXPECT_FALSE(measure->distance_matrix(16, 8)); + EXPECT_FALSE(measure->distance_matrix(16, 16)); + EXPECT_FALSE(measure->distance_matrix(32, 1)); + EXPECT_FALSE(measure->distance_matrix(32, 2)); + EXPECT_FALSE(measure->distance_matrix(32, 4)); + EXPECT_FALSE(measure->distance_matrix(32, 8)); + EXPECT_FALSE(measure->distance_matrix(32, 16)); + EXPECT_FALSE(measure->distance_matrix(32, 32)); + + EXPECT_FALSE(measure->support_normalize()); + float result = 1.0f; + measure->normalize(&result); + EXPECT_FLOAT_EQ(1.0f, result); +} + +TEST(CosineMeasure_General_Test, TestDistanceFp32) { + { + constexpr uint32_t dimension = 2; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP32, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto distance = measure->distance(); + ASSERT_NE(distance, nullptr); 
+ auto dist_matrix = measure->distance_matrix(1, 1); + ASSERT_NE(dist_matrix, nullptr); + + std::vector a = {0.2f, 0.9f}; + std::vector b = {0.3f, 0.5f}; + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float result = 0.0f; + distance(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP32), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.00001f, std::abs(result - 0.05131668f)); + + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP32), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.00001f, std::abs(result - 0.05131668f)); + } + + { + constexpr uint32_t dimension = 3; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP32, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto distance = measure->distance(); + ASSERT_NE(distance, nullptr); + auto dist_matrix = measure->distance_matrix(1, 1); + ASSERT_NE(dist_matrix, nullptr); + + std::vector a = {0.2f, 0.9f, 0.6f}; + std::vector b = {0.3f, 0.5f, 0.7f}; + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float result = 0.0f; + distance(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP32), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.00001f, std::abs(result - 0.07199293f)); + + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP32), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.00001f, std::abs(result - 0.07199293f)); + } + + { + constexpr uint32_t dimension = 11; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP32, dimension); + + auto measure = 
IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto distance = measure->distance(); + ASSERT_NE(distance, nullptr); + auto dist_matrix = measure->distance_matrix(1, 1); + ASSERT_NE(dist_matrix, nullptr); + + std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, + 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; + std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, + 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; + + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float result = 0.0f; + distance(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP32), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.00001f, std::abs(result - 0.2803060f)); + + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP32), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.00001f, std::abs(result - 0.2803060f)); + } +} + +TEST(CosineMeasure_General_Test, TestDistanceFp16) { + { + constexpr uint32_t dimension = 2; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP16, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto distance = measure->distance(); + ASSERT_NE(distance, nullptr); + auto dist_matrix = measure->distance_matrix(1, 1); + ASSERT_NE(dist_matrix, nullptr); + + std::vector a = {0.2f, 0.9f}; + std::vector b = {0.3f, 0.5f}; + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float result = 0.0f; + distance(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP16), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.001f, std::abs(result - 
0.05131668f)); + + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP16), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.001f, std::abs(result - 0.05131668f)); + } + + { + constexpr uint32_t dimension = 3; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP16, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto distance = measure->distance(); + ASSERT_NE(distance, nullptr); + auto dist_matrix = measure->distance_matrix(1, 1); + ASSERT_NE(dist_matrix, nullptr); + + std::vector a = {0.2f, 0.9f, 0.6f}; + std::vector b = {0.3f, 0.5f, 0.7f}; + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float result = 0.0f; + distance(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP16), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.001f, std::abs(result - 0.07199293f)); + + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP16), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.001f, std::abs(result - 0.07199293f)); + } + + { + constexpr uint32_t dimension = 11; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP16, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto distance = measure->distance(); + ASSERT_NE(distance, nullptr); + auto dist_matrix = measure->distance_matrix(1, 1); + ASSERT_NE(dist_matrix, nullptr); + + std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, + 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; + std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, + 1.0f, 2.3f, 3.4f, 4.5f, 
6.4f}; + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float result = 0.0f; + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP16), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.001f, std::abs(result - 0.2803060f)); + + dist_matrix(a_out.data(), b_out.data(), + dimension + ExtraDimension(IndexMeta::DT_FP16), &result); + + if (measure->support_normalize()) { + measure->normalize(&result); + } + + EXPECT_GE(0.001f, std::abs(result - 0.2803060f)); + } +} + +TEST(CosineMeasure_General_Test, TestDistanceBatchFp16Simple) { + { + constexpr uint32_t dimension = 2; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP16, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto dist_batch = measure->batch_distance(); + ASSERT_NE(dist_batch, nullptr); + + std::vector a = {0.2f, 0.9f}; + std::vector b = {0.3f, 0.5f}; + + std::string a_out; + std::string b_out; + + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float results[2] = {0.0f, 0.0f}; + + const void *vecs[2]; + vecs[0] = a_out.data(); + vecs[1] = b_out.data(); + dist_batch(vecs, b_out.data(), 2, + dimension + ExtraDimension(IndexMeta::DT_FP16), results); + + if (measure->support_normalize()) { + measure->normalize(&results[0]); + measure->normalize(&results[1]); + } + + EXPECT_GE(0.001f, std::abs(results[0] - 0.05131668f)); + EXPECT_GE(0.001f, std::abs(results[1] - 0.0f)); + } +} + +TEST(CosineMeasure_General_Test, TestDistanceBatchFp32Simple) { + { + constexpr uint32_t dimension = 2; + IndexMeta meta; + meta.set_meta(IndexMeta::DT_FP32, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto 
dist_batch = measure->batch_distance(); + ASSERT_NE(dist_batch, nullptr); + + std::vector a = {0.2f, 0.9f}; + std::vector b = {0.3f, 0.5f}; + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float results[2] = {0.0f, 0.0f}; + + const void *vecs[2]; + vecs[0] = a_out.data(); + vecs[1] = b_out.data(); + dist_batch(vecs, b_out.data(), 2, + dimension + ExtraDimension(IndexMeta::DT_FP32), results); + + if (measure->support_normalize()) { + measure->normalize(&results[0]); + measure->normalize(&results[1]); + } + + EXPECT_GE(0.00001f, std::abs(results[0] - 0.05131668f)); + EXPECT_GE(0.00001f, std::abs(results[1] - 0.0f)); + } +} + +template +void calculate_distance(std::vector &a, std::vector &b, size_t dimension, + IndexMeta::DataType data_type, size_t batch_size, + float expected_distance, float epsilon = 0.00001f) { + IndexMeta meta; + meta.set_meta(data_type, dimension); + + auto measure = IndexFactory::CreateMetric("Cosine"); + ASSERT_TRUE(measure); + Params params; + ASSERT_EQ(0, measure->init(meta, params)); + ASSERT_EQ(false, measure->support_train()); + + auto dist_batch = measure->batch_distance(); + ASSERT_NE(dist_batch, nullptr); + + std::string a_out; + std::string b_out; + + Norm2(a, &a_out); + Norm2(b, &b_out); + + float results[2] = {0.0f, 0.0f}; + + const void *vecs[2]; + vecs[0] = a_out.data(); + vecs[1] = b_out.data(); + dist_batch(vecs, b_out.data(), batch_size, + dimension + ExtraDimension(data_type), results); + + if (measure->support_normalize()) { + measure->normalize(&results[0]); + measure->normalize(&results[1]); + } + + EXPECT_GE(epsilon, std::abs(results[0] - expected_distance)); + EXPECT_GE(epsilon, std::abs(results[1] - 0.0f)); +} + + +TEST(CosineMeasure_General_Test, TestDistanceBatch) { + { + constexpr uint32_t dimension = 2; + + { + std::vector a = {0.2f, 0.9f}; + std::vector b = {0.3f, 0.5f}; + + calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 1, 0.05131668f, + 0.00001f); + 
calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 2, 0.05131668f, + 0.00001f); + } + { + std::vector a = {0.2f, 0.9f}; + std::vector b = {0.3f, 0.5f}; + + calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 1, 0.05131668f, + 0.001f); + calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 2, 0.05131668f, + 0.001f); + } + } + + { + constexpr uint32_t dimension = 3; + + + { + std::vector a = {0.2f, 0.9f, 0.6f}; + std::vector b = {0.3f, 0.5f, 0.7f}; + + calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 1, 0.07199293f, + 0.00001f); + calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 2, 0.07199293f, + 0.00001f); + } + { + std::vector a = {0.2f, 0.9f, 0.6f}; + std::vector b = {0.3f, 0.5f, 0.7f}; + + calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 1, 0.07199293f, + 0.001f); + calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 2, 0.07199293f, + 0.001f); + } + } + + { + constexpr uint32_t dimension = 11; + + { + std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, + 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; + std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, + 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; + + calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 1, 0.2803060f, + 0.00001f); + calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 2, 0.2803060f, + 0.00001f); + } + + { + std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, + 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; + std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, + 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; + + calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 1, 0.2803060f, + 0.001f); + calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 2, 0.2803060f, + 0.001f); + } + } +} + +#endif \ No newline at end of file diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc new file mode 100644 index 000000000..644ee46d0 --- /dev/null +++ b/tests/turbo/turbo_euclidean_test.cc @@ -0,0 +1,145 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; + +#if 0 +TEST(SquaredEuclideanMetric, General) { + auto metric = IndexFactory::CreateMetric("SquaredEuclidean"); + EXPECT_TRUE(metric); + + IndexMeta meta; + meta.set_meta(IndexMeta::DataType::DT_INT16, 64); + ASSERT_NE(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_BINARY32, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_BINARY64, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_FP16, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_FP32, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_INT4, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_INT8, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + + IndexMeta meta2; + meta2.set_meta(IndexMeta::DataType::DT_BINARY32, 64); + EXPECT_TRUE(metric->is_matched(meta)); + EXPECT_FALSE(metric->is_matched(meta2)); + EXPECT_TRUE(metric->is_matched( + meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 64))); + EXPECT_FALSE(metric->is_matched( + meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 63))); + + EXPECT_FALSE(metric->distance_matrix(0, 0)); + EXPECT_FALSE(metric->distance_matrix(3, 
5)); + EXPECT_FALSE(metric->distance_matrix(31, 65)); + EXPECT_TRUE(metric->distance_matrix(1, 1)); + EXPECT_TRUE(metric->distance_matrix(2, 1)); + EXPECT_TRUE(metric->distance_matrix(2, 2)); + EXPECT_TRUE(metric->distance_matrix(4, 1)); + EXPECT_TRUE(metric->distance_matrix(4, 2)); + EXPECT_TRUE(metric->distance_matrix(4, 4)); + EXPECT_TRUE(metric->distance_matrix(8, 1)); + EXPECT_TRUE(metric->distance_matrix(8, 2)); + EXPECT_TRUE(metric->distance_matrix(8, 4)); + EXPECT_TRUE(metric->distance_matrix(8, 8)); + EXPECT_FALSE(metric->distance_matrix(8, 32)); + EXPECT_FALSE(metric->distance_matrix(8, 9)); + EXPECT_TRUE(metric->distance_matrix(16, 1)); + EXPECT_TRUE(metric->distance_matrix(16, 2)); + EXPECT_TRUE(metric->distance_matrix(16, 4)); + EXPECT_TRUE(metric->distance_matrix(16, 8)); + EXPECT_TRUE(metric->distance_matrix(16, 16)); + EXPECT_FALSE(metric->distance_matrix(16, 17)); + EXPECT_TRUE(metric->distance_matrix(32, 1)); + EXPECT_TRUE(metric->distance_matrix(32, 2)); + EXPECT_TRUE(metric->distance_matrix(32, 4)); + EXPECT_TRUE(metric->distance_matrix(32, 8)); + EXPECT_TRUE(metric->distance_matrix(32, 16)); + EXPECT_TRUE(metric->distance_matrix(32, 32)); + + EXPECT_FALSE(metric->support_normalize()); + float result = 1.0f; + metric->normalize(&result); + EXPECT_FLOAT_EQ(1.0f, result); +} + +TEST(EuclideanMetric, General) { + auto metric = IndexFactory::CreateMetric("Euclidean"); + EXPECT_TRUE(metric); + + IndexMeta meta; + meta.set_meta(IndexMeta::DataType::DT_INT16, 64); + ASSERT_NE(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_BINARY32, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_BINARY64, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_FP16, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_FP32, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + 
meta.set_meta(IndexMeta::DataType::DT_INT4, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_INT8, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + + IndexMeta meta2; + meta2.set_meta(IndexMeta::DataType::DT_BINARY32, 64); + EXPECT_TRUE(metric->is_matched(meta)); + EXPECT_FALSE(metric->is_matched(meta2)); + EXPECT_TRUE(metric->is_matched( + meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 64))); + EXPECT_FALSE(metric->is_matched( + meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 63))); + + EXPECT_FALSE(metric->distance_matrix(0, 0)); + EXPECT_FALSE(metric->distance_matrix(3, 5)); + EXPECT_FALSE(metric->distance_matrix(31, 65)); + EXPECT_TRUE(metric->distance_matrix(1, 1)); + EXPECT_TRUE(metric->distance_matrix(2, 1)); + EXPECT_TRUE(metric->distance_matrix(2, 2)); + EXPECT_TRUE(metric->distance_matrix(4, 1)); + EXPECT_TRUE(metric->distance_matrix(4, 2)); + EXPECT_TRUE(metric->distance_matrix(4, 4)); + EXPECT_TRUE(metric->distance_matrix(8, 1)); + EXPECT_TRUE(metric->distance_matrix(8, 2)); + EXPECT_TRUE(metric->distance_matrix(8, 4)); + EXPECT_TRUE(metric->distance_matrix(8, 8)); + EXPECT_TRUE(metric->distance_matrix(16, 1)); + EXPECT_TRUE(metric->distance_matrix(16, 2)); + EXPECT_TRUE(metric->distance_matrix(16, 4)); + EXPECT_TRUE(metric->distance_matrix(16, 8)); + EXPECT_TRUE(metric->distance_matrix(16, 16)); + EXPECT_TRUE(metric->distance_matrix(32, 1)); + EXPECT_TRUE(metric->distance_matrix(32, 2)); + EXPECT_TRUE(metric->distance_matrix(32, 4)); + EXPECT_TRUE(metric->distance_matrix(32, 8)); + EXPECT_TRUE(metric->distance_matrix(32, 16)); + EXPECT_TRUE(metric->distance_matrix(32, 32)); + + EXPECT_FALSE(metric->support_normalize()); + float result = 1.0f; + metric->normalize(&result); + EXPECT_FLOAT_EQ(1.0f, result); +} + +#endif \ No newline at end of file diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc new file mode 100644 index 000000000..0ec1b567e 
--- /dev/null +++ b/tests/turbo/turbo_inner_product_test.cc @@ -0,0 +1,80 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include "zvec/core/framework/index_factory.h" + +using namespace zvec; +using namespace zvec::core; + +#if 0 +TEST(InnerProductMetric, General) { + auto metric = IndexFactory::CreateMetric("InnerProduct"); + ASSERT_TRUE(metric); + + IndexMeta meta; + meta.set_meta(IndexMeta::DataType::DT_BINARY32, 64); + ASSERT_NE(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_BINARY64, 64); + ASSERT_NE(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_FP16, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_FP32, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_INT4, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + meta.set_meta(IndexMeta::DataType::DT_INT8, 64); + ASSERT_EQ(0, metric->init(meta, ailego::Params())); + + IndexMeta meta2; + meta2.set_meta(IndexMeta::DataType::DT_BINARY32, 64); + EXPECT_TRUE(metric->is_matched(meta)); + EXPECT_FALSE(metric->is_matched(meta2)); + EXPECT_TRUE(metric->is_matched( + meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 64))); + EXPECT_FALSE(metric->is_matched( + meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 63))); + + EXPECT_FALSE(metric->distance_matrix(0, 0)); + 
EXPECT_FALSE(metric->distance_matrix(3, 5)); + EXPECT_FALSE(metric->distance_matrix(31, 65)); + EXPECT_TRUE(metric->distance_matrix(1, 1)); + EXPECT_TRUE(metric->distance_matrix(2, 1)); + EXPECT_TRUE(metric->distance_matrix(2, 2)); + EXPECT_TRUE(metric->distance_matrix(4, 1)); + EXPECT_TRUE(metric->distance_matrix(4, 2)); + EXPECT_TRUE(metric->distance_matrix(4, 4)); + EXPECT_TRUE(metric->distance_matrix(8, 1)); + EXPECT_TRUE(metric->distance_matrix(8, 2)); + EXPECT_TRUE(metric->distance_matrix(8, 4)); + EXPECT_TRUE(metric->distance_matrix(8, 8)); + EXPECT_TRUE(metric->distance_matrix(16, 1)); + EXPECT_TRUE(metric->distance_matrix(16, 2)); + EXPECT_TRUE(metric->distance_matrix(16, 4)); + EXPECT_TRUE(metric->distance_matrix(16, 8)); + EXPECT_TRUE(metric->distance_matrix(16, 16)); + EXPECT_TRUE(metric->distance_matrix(32, 1)); + EXPECT_TRUE(metric->distance_matrix(32, 2)); + EXPECT_TRUE(metric->distance_matrix(32, 4)); + EXPECT_TRUE(metric->distance_matrix(32, 8)); + EXPECT_TRUE(metric->distance_matrix(32, 16)); + EXPECT_TRUE(metric->distance_matrix(32, 32)); + + EXPECT_TRUE(metric->support_normalize()); + float result = 1.0f; + metric->normalize(&result); + EXPECT_FLOAT_EQ(-1.0f, result); +} + +#endif \ No newline at end of file diff --git a/tests/turbo/quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer.cc similarity index 94% rename from tests/turbo/quantized_integer_test.cc rename to tests/turbo/turbo_quantized_integer.cc index 94167557c..ef12b5fa4 100644 --- a/tests/turbo/quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer.cc @@ -40,7 +40,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); - auto func_float = turbo::get_distance_func( + auto func_float32 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); @@ -81,10 +81,10 
@@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { &qmeta_reformer)); ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - float score_float = ailego::Distance::MinusInnerProduct( + float score_float32 = ailego::Distance::MinusInnerProduct( query_vec.data(), doc_vec.data(), DIMENSION); - func_float(query_vec.data(), doc_vec.data(), DIMENSION, &score_float); + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); float score_scalar{0.0f}; float score_avx2{0.0f}; @@ -99,9 +99,9 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_sse); - ASSERT_NEAR(score_float, score_avx2, 0.2 * DIMENSION); - ASSERT_NEAR(score_float, score_sse, 0.2 * DIMENSION); - ASSERT_NEAR(score_float, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); ASSERT_NEAR(score_scalar, score_avx2, 0.001); ASSERT_NEAR(score_scalar, score_sse, 0.001); } From 92340b946dbc0ab8943bc81479b7f15ac7ed0634 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 16:54:14 +0800 Subject: [PATCH 14/44] feat: add dist funcs --- src/turbo/avx512/float32/common.h | 27 ++++++++ src/turbo/avx512/float32/inner_product.cc | 15 +++-- src/turbo/avx512/float32/squared_euclidean.cc | 64 +++++++++++++++++-- .../scalar/record_quantized_int4/common.h | 24 +++++++ .../record_quantized_int4/inner_product.cc | 17 +++-- .../scalar/record_quantized_int8/common.h | 19 ++++++ .../scalar/record_quantized_int8/cosine.cc | 4 +- .../record_quantized_int8/inner_product.cc | 28 ++++++-- ...ger.cc => turbo_quantized_integer_test.cc} | 8 +-- 9 files changed, 180 insertions(+), 26 deletions(-) rename tests/turbo/{turbo_quantized_integer.cc => turbo_quantized_integer_test.cc} (98%) diff --git a/src/turbo/avx512/float32/common.h b/src/turbo/avx512/float32/common.h index 
13be3a2bf..36111ab18 100644 --- a/src/turbo/avx512/float32/common.h +++ b/src/turbo/avx512/float32/common.h @@ -21,3 +21,30 @@ // overhead. #pragma once + +#if defined(__AVX512F__) +#include +#include +#include + +//! Calculate Fused-Multiply-Add (AVX512) +#define FMA_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ps(zmm_m, zmm_q, zmm_sum); + + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +#endif // __AVX512F__ \ No newline at end of file diff --git a/src/turbo/avx512/float32/inner_product.cc b/src/turbo/avx512/float32/inner_product.cc index 84264127a..0055d5911 100644 --- a/src/turbo/avx512/float32/inner_product.cc +++ b/src/turbo/avx512/float32/inner_product.cc @@ -15,7 +15,7 @@ #include "avx512/float32/inner_product.h" #include "avx512/float32/common.h" -#if defined(__AVX2__) +#if defined(__AVX512F__) #include #endif @@ -25,12 +25,12 @@ namespace zvec::turbo::avx512 { // vector pair. 
void inner_product_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX512__) +#if defined(__AVX512F__) const float *lhs = reinterpret_cast(a); const float *rhs = reinterpret_cast(b); - const float *last = lhs + size; - const float *last_aligned = lhs + ((size >> 5) << 5); + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); __m512 zmm_sum_0 = _mm512_setzero_ps(); __m512 zmm_sum_1 = _mm512_setzero_ps(); @@ -73,21 +73,22 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim, _mm512_mask_loadu_ps(zmm_undefined, mask, lhs), _mm512_mask_loadu_ps(zmm_undefined, mask, rhs), zmm_sum_0, mask); } - return HorizontalAdd_FP32_V512(zmm_sum_0); + + *distance = -1 * HorizontalAdd_FP32_V512(zmm_sum_0); #else (void)a; (void)b; (void)dim; (void)distance; -#endif //__AVX2__ +#endif //__AVX512F__ } // Batch version of inner_product_fp32_distance. void inner_product_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX512__) +#if defined(__AVX512F__) #else (void)vectors; diff --git a/src/turbo/avx512/float32/squared_euclidean.cc b/src/turbo/avx512/float32/squared_euclidean.cc index 9a21ced80..8f492e0fb 100644 --- a/src/turbo/avx512/float32/squared_euclidean.cc +++ b/src/turbo/avx512/float32/squared_euclidean.cc @@ -15,7 +15,7 @@ #include "avx512/float32/squared_euclidean.h" #include "avx512/float32/common.h" -#if defined(__AVX512__) +#if defined(__AVX512F__) #include #endif @@ -23,26 +23,80 @@ namespace zvec::turbo::avx512 { void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX512__) +#if defined(__AVX512F__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 5) << 5); + + __m512 zmm_sum_0 = _mm512_setzero_ps(); + __m512 zmm_sum_1 = 
_mm512_setzero_ps(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m512 zmm_d_0 = + _mm512_sub_ps(_mm512_load_ps(lhs + 0), _mm512_load_ps(rhs + 0)); + __m512 zmm_d_1 = + _mm512_sub_ps(_mm512_load_ps(lhs + 16), _mm512_load_ps(rhs + 16)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ps(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m512 zmm_d = _mm512_sub_ps(_mm512_load_ps(lhs), _mm512_load_ps(rhs)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 32, rhs += 32) { + __m512 zmm_d_0 = + _mm512_sub_ps(_mm512_loadu_ps(lhs + 0), _mm512_loadu_ps(rhs + 0)); + __m512 zmm_d_1 = + _mm512_sub_ps(_mm512_loadu_ps(lhs + 16), _mm512_loadu_ps(rhs + 16)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ps(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 16) { + __m512 zmm_d = _mm512_sub_ps(_mm512_loadu_ps(lhs), _mm512_loadu_ps(rhs)); + zmm_sum_0 = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum_0); + lhs += 16; + rhs += 16; + } + } + + zmm_sum_0 = _mm512_add_ps(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask16 mask = (__mmask16)((1 << (last - lhs)) - 1); + __m512 zmm_undefined = _mm512_undefined_ps(); + __m512 zmm_d = _mm512_mask_sub_ps( + zmm_undefined, mask, _mm512_mask_loadu_ps(zmm_undefined, mask, lhs), + _mm512_mask_loadu_ps(zmm_undefined, mask, rhs)); + zmm_sum_0 = _mm512_mask3_fmadd_ps(zmm_d, zmm_d, zmm_sum_0, mask); + } + + *distance = HorizontalAdd_FP32_V512(zmm_sum_0); #else (void)a; (void)b; (void)dim; (void)distance; -#endif // __AVX512__ +#endif // __AVX512F__ } void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX512__) +#if defined(__AVX512F__) #else (void)vectors; (void)query; (void)n; 
(void)dim; (void)distances; -#endif //__AVX512__ +#endif //__AVX512F__ } } // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h index 13be3a2bf..c3d49e723 100644 --- a/src/turbo/scalar/record_quantized_int4/common.h +++ b/src/turbo/scalar/record_quantized_int4/common.h @@ -21,3 +21,27 @@ // overhead. #pragma once + +#include +#include + +/*! Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.cc b/src/turbo/scalar/record_quantized_int4/inner_product.cc index f3e183f20..206f85e10 100644 --- a/src/turbo/scalar/record_quantized_int4/inner_product.cc +++ b/src/turbo/scalar/record_quantized_int4/inner_product.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "scalar/record_quantized_int4/inner_product.h" +#include #include "scalar/record_quantized_int4/common.h" namespace zvec::turbo::scalar { @@ -21,10 +22,18 @@ namespace zvec::turbo::scalar { // vector pair. void inner_product_int4_distance(const void *a, const void *b, size_t dim, float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; + const uint8_t *m = reinterpret_cast(a); + const uint8_t *q = reinterpret_cast(b); + + float sum = 0.0; + for (size_t i = 0; i < (dim >> 1); ++i) { + uint8_t m_val = m[i]; + uint8_t q_val = q[i]; + sum += Int4MulTable[((m_val << 4) & 0xf0) | ((q_val >> 0) & 0xf)] + + Int4MulTable[((m_val >> 0) & 0xf0) | ((q_val >> 4) & 0xf)]; + } + + *distance = -sum; } // Batch version of inner_product_int4_distance. diff --git a/src/turbo/scalar/record_quantized_int8/common.h b/src/turbo/scalar/record_quantized_int8/common.h index 13be3a2bf..92ab3736d 100644 --- a/src/turbo/scalar/record_quantized_int8/common.h +++ b/src/turbo/scalar/record_quantized_int8/common.h @@ -21,3 +21,22 @@ // overhead. 
// Scalar (portable) int8 quantized inner-product distance.
//
// Record layout for both `a` and `b`: (dim - 20) int8 codes followed by a
// 20-byte tail whose first three floats are the per-record dequantization
// parameters (alpha, beta, sum-of-codes).
namespace zvec::turbo::scalar {

namespace internal {

// Raw integer inner product of two int8 code arrays.
// Writes the NEGATED accumulated sum (minus inner product) into *distance,
// matching the sign convention of the SIMD kernels' scalar fallback.
static inline void inner_product_int8_scalar(const void *a, const void *b,
                                             size_t dim, float *distance) {
  const int8_t *m = reinterpret_cast<const int8_t *>(a);
  const int8_t *q = reinterpret_cast<const int8_t *>(b);

  float sum = 0.0f;
  for (size_t i = 0; i < dim; ++i) {
    sum += static_cast<float>(m[i] * q[i]);
  }

  *distance = -sum;
}

}  // namespace internal

// Compute the dequantized minus-inner-product distance between one record
// pair. `dim` is the full record size: code count plus the 20-byte tail.
// If the record is too short to contain any codes, *distance is left
// untouched.
void inner_product_int8_distance(const void *a, const void *b, size_t dim,
                                 float *distance) {
  // Guard in SIGNED arithmetic: `dim` is unsigned, so `dim - 20` would wrap
  // around for short records and a `size_t <= 0` test could never fire.
  const int original_dim = static_cast<int>(dim) - 20;
  if (original_dim <= 0) {
    return;
  }

  internal::inner_product_int8_scalar(a, b,
                                      static_cast<size_t>(original_dim),
                                      distance);
  // Helper returns the minus inner product; the dequantization formula
  // below needs the raw (positive) inner product.
  const float ip = -*distance;

  const float *a_tail = reinterpret_cast<const float *>(
      reinterpret_cast<const uint8_t *>(a) + original_dim);
  const float *b_tail = reinterpret_cast<const float *>(
      reinterpret_cast<const uint8_t *>(b) + original_dim);

  // Query-side dequantization params.
  float qa = a_tail[0];
  float qb = a_tail[1];
  float qs = a_tail[2];

  // Matrix-side dequantization params.
  float ma = b_tail[0];
  float mb = b_tail[1];
  float ms = b_tail[2];

  // Dequantized minus inner product.
  *distance = -(ma * qa * ip + mb * qa * qs + qb * ma * ms +
                original_dim * qb * mb);
}

}  // namespace zvec::turbo::scalar
// Scalar (portable) int4 quantized inner-product distance.
//
// Record layout for both `a` and `b`: (dim - 32) packed 4-bit codes (two
// per byte) followed by a tail whose first three floats are the per-record
// dequantization parameters; the tail starts at byte offset (dim - 32) / 2.
namespace zvec::turbo::scalar {

namespace internal {

/*! Four-bits Integer Multiplication Table.
 *  Entry ((x << 4) | y) holds s4(x) * s4(y), where s4() decodes a nibble
 *  as a signed 4-bit value (0..7, -8..-1).
 */
alignas(64) static const int8_t Int4MulTable[256] = {
    0, 0,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  0,  0,
    0, 1,  2,   3,   4,   5,   6,   7,   -8,  -7,  -6,  -5,  -4,  -3, -2, -1,
    0, 2,  4,   6,   8,   10,  12,  14,  -16, -14, -12, -10, -8,  -6, -4, -2,
    0, 3,  6,   9,   12,  15,  18,  21,  -24, -21, -18, -15, -12, -9, -6, -3,
    0, 4,  8,   12,  16,  20,  24,  28,  -32, -28, -24, -20, -16, -12, -8, -4,
    0, 5,  10,  15,  20,  25,  30,  35,  -40, -35, -30, -25, -20, -15, -10, -5,
    0, 6,  12,  18,  24,  30,  36,  42,  -48, -42, -36, -30, -24, -18, -12, -6,
    0, 7,  14,  21,  28,  35,  42,  49,  -56, -49, -42, -35, -28, -21, -14, -7,
    0, -8, -16, -24, -32, -40, -48, -56, 64,  56,  48,  40,  32,  24, 16, 8,
    0, -7, -14, -21, -28, -35, -42, -49, 56,  49,  42,  35,  28,  21, 14, 7,
    0, -6, -12, -18, -24, -30, -36, -42, 48,  42,  36,  30,  24,  18, 12, 6,
    0, -5, -10, -15, -20, -25, -30, -35, 40,  35,  30,  25,  20,  15, 10, 5,
    0, -4, -8,  -12, -16, -20, -24, -28, 32,  28,  24,  20,  16,  12, 8,  4,
    0, -3, -6,  -9,  -12, -15, -18, -21, 24,  21,  18,  15,  12,  9,  6,  3,
    0, -2, -4,  -6,  -8,  -10, -12, -14, 16,  14,  12,  10,  8,   6,  4,  2,
    0, -1, -2,  -3,  -4,  -5,  -6,  -7,  8,   7,   6,   5,   4,   3,  2,  1,
};

// Raw inner product of two packed int4 code arrays.
// `dim` is the ELEMENT count (two elements per byte); the loop walks
// dim / 2 bytes, pairing low nibble with low nibble and high with high.
// Writes the NEGATED sum (minus inner product) into *distance.
static inline void inner_product_int4_scalar(const void *a, const void *b,
                                             size_t dim, float *distance) {
  const uint8_t *m = reinterpret_cast<const uint8_t *>(a);
  const uint8_t *q = reinterpret_cast<const uint8_t *>(b);

  float sum = 0.0f;
  for (size_t i = 0; i < (dim >> 1); ++i) {
    uint8_t m_val = m[i];
    uint8_t q_val = q[i];
    sum += Int4MulTable[((m_val << 4) & 0xf0) | (q_val & 0xf)] +
           Int4MulTable[(m_val & 0xf0) | ((q_val >> 4) & 0xf)];
  }

  *distance = -sum;
}

}  // namespace internal

// Compute the dequantized minus-inner-product distance between one int4
// record pair. `dim` is the element count plus the 32-element tail span.
// If the record is too short to contain any codes, *distance is left
// untouched.
void inner_product_int4_distance(const void *a, const void *b, size_t dim,
                                 float *distance) {
  // Signed element count; the guard must run on the SIGNED value — the
  // previous `size_t original_dim = d >> 1; if (original_dim <= 0)` test
  // could never fire because a negative `d` wraps to a huge size_t.
  const int d = static_cast<int>(dim) - 32;
  if (d <= 0) {
    return;
  }
  // Packed code bytes (two 4-bit elements per byte); also the tail offset.
  const size_t code_bytes = static_cast<size_t>(d) >> 1;

  // The helper takes the ELEMENT count (it walks d/2 bytes itself); passing
  // `code_bytes` here would silently drop half of the codes and diverge
  // from the SSE/AVX2 paths.
  internal::inner_product_int4_scalar(a, b, static_cast<size_t>(d), distance);
  // Helper returns the minus inner product; the formula below needs the
  // raw (positive) inner product, mirroring the int8 scalar path.
  const float ip = -*distance;

  const float *a_tail = reinterpret_cast<const float *>(
      reinterpret_cast<const uint8_t *>(a) + code_bytes);
  const float *b_tail = reinterpret_cast<const float *>(
      reinterpret_cast<const uint8_t *>(b) + code_bytes);

  // Query-side dequantization params.
  float qa = a_tail[0];
  float qb = a_tail[1];
  float qs = a_tail[2];

  // Matrix-side dequantization params.
  float ma = b_tail[0];
  float mb = b_tail[1];
  float ms = b_tail[2];

  // Dequantized minus inner product (d = element count).
  *distance =
      -(ma * qa * ip + mb * qa * qs + qb * ma * ms + d * qb * mb);
}

}  // namespace zvec::turbo::scalar
#include "sse/record_quantized_int4/cosine.h" -#include "sse/record_quantized_int4/inner_product_common.h" +#include "sse/record_quantized_int4/common.h" #if defined(__SSE__) #include #endif diff --git a/src/turbo/sse/record_quantized_int4/inner_product.cc b/src/turbo/sse/record_quantized_int4/inner_product.cc index 33a889f5f..29c04b718 100644 --- a/src/turbo/sse/record_quantized_int4/inner_product.cc +++ b/src/turbo/sse/record_quantized_int4/inner_product.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "sse/record_quantized_int4/inner_product.h" -#include "sse/record_quantized_int4/inner_product_common.h" +#include "sse/record_quantized_int4/common.h" #if defined(__SSE__) #include @@ -26,7 +26,30 @@ namespace zvec::turbo::sse { void inner_product_int4_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__SSE__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = + -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + d * qb * mb); #else (void)a; (void)b; diff --git a/src/turbo/sse/record_quantized_int4/inner_product_common.h b/src/turbo/sse/record_quantized_int4/inner_product_common.h deleted file mode 100644 index 6d12504e3..000000000 --- a/src/turbo/sse/record_quantized_int4/inner_product_common.h +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - -#pragma once - -#if defined(__AVX2__) -#include -#include -#include -#include - -namespace zvec::turbo::avx2::internal { - - -/*! Four-bits Integer Multiplication Table - */ -static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, - 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, - 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, - 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, - 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, - 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, - 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, - 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, - 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, - 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, - 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, - 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, - 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, - 0, -2, -4, 
-6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, - 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, -}; - -//! Calculate Fused-Multiply-Add (GENERAL) -#define FMA_INT4_GENERAL(m, q, sum) \ - sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ - Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; - -static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { - __m256i x1 = _mm256_hadd_epi32(v, v); - __m256i x2 = _mm256_hadd_epi32(x1, x1); - __m128i x3 = _mm256_extractf128_si256(x2, 1); - __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); - return _mm_cvtsi128_si32(x4); -} - -#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) -#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) - -#define MASK_INT4_AVX _mm256_set1_epi32(0xf0f0f0f0) -#define ONES_INT16_AVX _mm256_set1_epi32(0x00010001) - -static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, - 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; - -#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) - -#define INT4_LOOKUP_AVX _mm256_load_si256((const __m256i *)Int4ConvertTable) - -#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) - -//! 
Compute the distance between matrix and query -#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ - { \ - __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ - __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ - __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, \ - _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ - __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ - INT4_LOOKUP_SSE, \ - _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ - xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ - xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ - xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ - xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ - xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ - ONES_INT16_SSE); \ - xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ - ONES_INT16_SSE); \ - xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ - } - -#define FMA_INT4_ITER_AVX(ymm_lhs, ymm_rhs, ymm_sum) \ - { \ - __m256i ymm_lhs_0 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, _mm256_and_si256((ymm_lhs), MASK_INT4_AVX)); \ - __m256i ymm_rhs_0 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, _mm256_and_si256((ymm_rhs), MASK_INT4_AVX)); \ - __m256i ymm_lhs_1 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, \ - _mm256_and_si256(_mm256_srli_epi32((ymm_lhs), 4), MASK_INT4_AVX)); \ - __m256i ymm_rhs_1 = _mm256_shuffle_epi8( \ - INT4_LOOKUP_AVX, \ - _mm256_and_si256(_mm256_srli_epi32((ymm_rhs), 4), MASK_INT4_AVX)); \ - ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); \ - ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); \ - ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); \ - ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); \ - ymm_lhs_0 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), \ - ONES_INT16_AVX); \ - ymm_lhs_1 = _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), \ - ONES_INT16_AVX); \ - ymm_sum = \ 
- _mm256_add_epi32(_mm256_add_epi32(ymm_lhs_0, ymm_lhs_1), ymm_sum); \ - } - -#if defined(__SSE2__) -static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { -#ifdef __SSE3__ - __m128i x1 = _mm_hadd_epi32(v, v); - __m128i x2 = _mm_hadd_epi32(x1, x1); - return _mm_cvtsi128_si32(x2); -#else - __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); - __m128i x2 = _mm_add_epi32(v, x1); - __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); - __m128i x4 = _mm_add_epi32(x2, x3); - return _mm_cvtsi128_si32(x4); -#endif -} -#endif // __SSE2__ - -//! Compute the distance between matrix and query -static __attribute__((always_inline)) void inner_product_int4_avx2( - const void *a, const void *b, size_t size, float *distance) { - const uint8_t *lhs = reinterpret_cast(a); - const uint8_t *rhs = reinterpret_cast(b); - const uint8_t *last = lhs + size; - const uint8_t *last_aligned = lhs + ((size >> 4) << 4); - __m128i xmm_sum = _mm_setzero_si128(); - - if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { - for (; lhs != last_aligned; lhs += 16, rhs += 16) { - __m128i xmm_lhs = _mm_load_si128((const __m128i *)(lhs)); - __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); - FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) - } - } else { - for (; lhs != last_aligned; lhs += 16, rhs += 16) { - __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); - __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); - FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) - } - } - float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); - - switch (last - lhs) { - case 15: - FMA_INT4_GENERAL(lhs[14], rhs[14], result) - /* FALLTHRU */ - case 14: - FMA_INT4_GENERAL(lhs[13], rhs[13], result) - /* FALLTHRU */ - case 13: - FMA_INT4_GENERAL(lhs[12], rhs[12], result) - /* FALLTHRU */ - case 12: - FMA_INT4_GENERAL(lhs[11], rhs[11], result) - /* FALLTHRU */ - case 11: - FMA_INT4_GENERAL(lhs[10], rhs[10], result) - /* FALLTHRU */ - case 10: - FMA_INT4_GENERAL(lhs[9], 
rhs[9], result) - /* FALLTHRU */ - case 9: - FMA_INT4_GENERAL(lhs[8], rhs[8], result) - /* FALLTHRU */ - case 8: - FMA_INT4_GENERAL(lhs[7], rhs[7], result) - /* FALLTHRU */ - case 7: - FMA_INT4_GENERAL(lhs[6], rhs[6], result) - /* FALLTHRU */ - case 6: - FMA_INT4_GENERAL(lhs[5], rhs[5], result) - /* FALLTHRU */ - case 5: - FMA_INT4_GENERAL(lhs[4], rhs[4], result) - /* FALLTHRU */ - case 4: - FMA_INT4_GENERAL(lhs[3], rhs[3], result) - /* FALLTHRU */ - case 3: - FMA_INT4_GENERAL(lhs[2], rhs[2], result) - /* FALLTHRU */ - case 2: - FMA_INT4_GENERAL(lhs[1], rhs[1], result) - /* FALLTHRU */ - case 1: - FMA_INT4_GENERAL(lhs[0], rhs[0], result) - } - - *distance = result; -} - -// Compute raw integer inner products for a batch of int8 vectors against a -// single query. Uses AVX512-VNNI dpbusd instruction. -// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. -template -__attribute__((always_inline)) void inner_product_int4_batch_avx2_impl( - const void *query, const void *const *vectors, - const std::array &prefetch_ptrs, - size_t dimensionality, float *distances) {} - -static __attribute__((always_inline)) void inner_product_int4_batch_avx2( - const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) { - static constexpr size_t batch_size = 2; - static constexpr size_t prefetch_step = 2; - size_t i = 0; - for (; i + batch_size <= n; i += batch_size) { - std::array prefetch_ptrs; - for (size_t j = 0; j < batch_size; ++j) { - if (i + j + batch_size * prefetch_step < n) { - prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; - } else { - prefetch_ptrs[j] = nullptr; - } - } - inner_product_int4_batch_avx2_impl( - query, &vectors[i], prefetch_ptrs, dim, distances + i); - } - for (; i < n; i++) { - std::array prefetch_ptrs{nullptr}; - inner_product_int4_batch_avx2_impl<1>(query, &vectors[i], prefetch_ptrs, - dim, distances + i); - } -} - -} // namespace zvec::turbo::avx2::internal - -#endif // defined(__AVX2__) diff 
--git a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc index 0b4d34cd9..c771ffb19 100644 --- a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "sse/record_quantized_int4/squared_euclidean.h" -#include "sse/record_quantized_int4/inner_product_common.h" +#include "sse/record_quantized_int4/common.h" #if defined(__SSE__) #include diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 748b840d2..86893a069 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -137,15 +137,13 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, } } - // if (metric_type == MetricType::kSquaredEuclidean) { - // return scalar::squared_euclidean_int4_distance; - // } - // else if (metric_type == MetricType::kCosine) { - // return scalar::cosine_int4_distance; - // } - // else if (metric_type == MetricType::kInnerProduct) { - // return scalar::inner_product_int4_distance; - // } + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } else if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } else if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } } } diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc index c48c1d93c..587203108 100644 --- a/tests/turbo/turbo_quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -109,16 +109,19 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); - const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; const size_t COUNT = 1000; - auto converter = 
IndexFactory::CreateConverter("Int8StreamingConverter"); + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); auto func_avx2 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, @@ -128,6 +131,10 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + ailego::NumericalVector query_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { query_vec[j] = dist(gen); @@ -153,19 +160,26 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { &qmeta_reformer)); ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - float score_float = ailego::Distance::MinusInnerProduct( - query_vec.data(), doc_vec.data(), DIMENSION); - + float score_float32{0.0f}; + float score_scalar{0.0f}; float score_avx2{0.0f}; float score_sse{0.0f}; + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_avx2); + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_sse); - ASSERT_NEAR(score_float, score_avx2, 0.2 * DIMENSION); - ASSERT_NEAR(score_float, score_sse, 0.2 * DIMENSION); - ASSERT_NEAR(score_avx2, score_sse, 0.001); + 
ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); } } From cf017bcc09c4f9e374d699aabe0dd5e3a9e82982 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 20:06:34 +0800 Subject: [PATCH 17/44] feat: add dist funcs --- .../squared_euclidean.cc | 26 ++ src/turbo/avx512/float32/cosine.cc | 17 +- .../squared_euclidean.cc | 33 +- .../squared_euclidean.cc | 32 +- src/turbo/sse/record_quantized_int4/common.h | 182 +++++++++ .../record_quantized_int4/inner_product.cc | 12 +- .../squared_euclidean.cc | 38 +- .../squared_euclidean.cc | 26 ++ tests/turbo/turbo_quantized_integer_test.cc | 346 ++++++++++++++++++ 9 files changed, 688 insertions(+), 24 deletions(-) create mode 100644 src/turbo/sse/record_quantized_int4/common.h diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc index 2d493602b..0c3c71079 100644 --- a/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean.cc @@ -24,7 +24,33 @@ namespace zvec::turbo::avx2 { void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX2__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_avx2(a, b, original_dim, distance); + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma 
* ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); #else (void)a; (void)b; diff --git a/src/turbo/avx512/float32/cosine.cc b/src/turbo/avx512/float32/cosine.cc index 9eb6b5b00..78ee5e4a7 100644 --- a/src/turbo/avx512/float32/cosine.cc +++ b/src/turbo/avx512/float32/cosine.cc @@ -14,8 +14,9 @@ #include "avx512/float32/cosine.h" #include "avx512/float32/common.h" +#include "avx512/float32/inner_product.h" -#if defined(__AVX512__) +#if defined(__AVX512F__) #include #endif @@ -23,19 +24,25 @@ namespace zvec::turbo::avx512 { void cosine_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX2__) +#if defined(__AVX512F__) + constexpr size_t extra_dim = 1; + size_t d = dim - extra_dim; + float ip; + inner_product_fp32_distance(a, b, d, &ip); + + *distance = 1 - ip; #else (void)a; (void)b; (void)dim; (void)distance; -#endif // __AVX2__ +#endif // __AVX512F__ } void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX2__) +#if defined(__AVX512F__) #else (void)vectors; @@ -43,7 +50,7 @@ void cosine_fp32_batch_distance(const void *const *vectors, const void *query, (void)n; (void)dim; (void)distances; -#endif //__AVX2__ +#endif //__AVX512F__ } } // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc index 555cc85a5..0feb7eae1 100644 --- a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc @@ -19,10 +19,35 @@ namespace zvec::turbo::scalar { void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; + const int d = dim - 32; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) 
{ + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); } void squared_euclidean_int4_batch_distance(const void *const *vectors, diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc index aa8b7be66..82d5180c9 100644 --- a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc @@ -19,10 +19,34 @@ namespace zvec::turbo::scalar { void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int8_scalar(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); } void squared_euclidean_int8_batch_distance(const void 
*const *vectors, diff --git a/src/turbo/sse/record_quantized_int4/common.h b/src/turbo/sse/record_quantized_int4/common.h new file mode 100644 index 000000000..66ba30fa0 --- /dev/null +++ b/src/turbo/sse/record_quantized_int4/common.h @@ -0,0 +1,182 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__SSE4_1__) +#include +#include +#include +#include + +namespace zvec::turbo::sse::internal { + +//! Four-bits Convert Table +static const AILEGO_ALIGNED(32) int8_t Int4ConvertTable[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}; + +/*! 
Four-bits Integer Multiplication Table + */ +static const AILEGO_ALIGNED(64) int8_t Int4MulTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 2, 4, 6, 8, 10, 12, 14, -16, -14, -12, -10, -8, -6, -4, -2, + 0, 3, 6, 9, 12, 15, 18, 21, -24, -21, -18, -15, -12, -9, -6, -3, + 0, 4, 8, 12, 16, 20, 24, 28, -32, -28, -24, -20, -16, -12, -8, -4, + 0, 5, 10, 15, 20, 25, 30, 35, -40, -35, -30, -25, -20, -15, -10, -5, + 0, 6, 12, 18, 24, 30, 36, 42, -48, -42, -36, -30, -24, -18, -12, -6, + 0, 7, 14, 21, 28, 35, 42, 49, -56, -49, -42, -35, -28, -21, -14, -7, + 0, -8, -16, -24, -32, -40, -48, -56, 64, 56, 48, 40, 32, 24, 16, 8, + 0, -7, -14, -21, -28, -35, -42, -49, 56, 49, 42, 35, 28, 21, 14, 7, + 0, -6, -12, -18, -24, -30, -36, -42, 48, 42, 36, 30, 24, 18, 12, 6, + 0, -5, -10, -15, -20, -25, -30, -35, 40, 35, 30, 25, 20, 15, 10, 5, + 0, -4, -8, -12, -16, -20, -24, -28, 32, 28, 24, 20, 16, 12, 8, 4, + 0, -3, -6, -9, -12, -15, -18, -21, 24, 21, 18, 15, 12, 9, 6, 3, + 0, -2, -4, -6, -8, -10, -12, -14, 16, 14, 12, 10, 8, 6, 4, 2, + 0, -1, -2, -3, -4, -5, -6, -7, 8, 7, 6, 5, 4, 3, 2, 1, +}; + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_INT4_GENERAL(m, q, sum) \ + sum += Int4MulTable[(((m) << 4) & 0xf0) | (((q) >> 0) & 0xf)] + \ + Int4MulTable[(((m) >> 0) & 0xf0) | (((q) >> 4) & 0xf)]; + +#define MASK_INT4_SSE _mm_set1_epi32(0x0f0f0f0f) +#define ONES_INT16_SSE _mm_set1_epi32(0x00010001) +#define INT4_LOOKUP_SSE _mm_load_si128((const __m128i *)Int4ConvertTable) + +//! 
Compute the distance between matrix and query +#define FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) \ + { \ + __m128i xmm_lhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_lhs), MASK_INT4_SSE)); \ + __m128i xmm_rhs_0 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, _mm_and_si128((xmm_rhs), MASK_INT4_SSE)); \ + __m128i xmm_lhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_lhs), 4), MASK_INT4_SSE)); \ + __m128i xmm_rhs_1 = _mm_shuffle_epi8( \ + INT4_LOOKUP_SSE, \ + _mm_and_si128(_mm_srli_epi32((xmm_rhs), 4), MASK_INT4_SSE)); \ + xmm_lhs_0 = _mm_sign_epi8(xmm_lhs_0, xmm_rhs_0); \ + xmm_lhs_1 = _mm_sign_epi8(xmm_lhs_1, xmm_rhs_1); \ + xmm_rhs_0 = _mm_abs_epi8(xmm_rhs_0); \ + xmm_rhs_1 = _mm_abs_epi8(xmm_rhs_1); \ + xmm_lhs_0 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_0, xmm_lhs_0), \ + ONES_INT16_SSE); \ + xmm_lhs_1 = _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs_1, xmm_lhs_1), \ + ONES_INT16_SSE); \ + xmm_sum = _mm_add_epi32(_mm_add_epi32(xmm_lhs_0, xmm_lhs_1), xmm_sum); \ + } + +static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { +#ifdef __SSE3__ + __m128i x1 = _mm_hadd_epi32(v, v); + __m128i x2 = _mm_hadd_epi32(x1, x1); + return _mm_cvtsi128_si32(x2); +#else + __m128i x1 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)); + __m128i x2 = _mm_add_epi32(v, x1); + __m128i x3 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0, 0, 0, 1)); + __m128i x4 = _mm_add_epi32(x2, x3); + return _mm_cvtsi128_si32(x4); +#endif +} + +static __attribute__((always_inline)) void inner_product_int4_sse( + const void *a, const void *b, size_t size, float *distance) { + const uint8_t *lhs = reinterpret_cast(a); + const uint8_t *rhs = reinterpret_cast(b); + + const uint8_t *last = lhs + size; + const uint8_t *last_aligned = lhs + ((size >> 4) << 4); + __m128i xmm_sum = _mm_setzero_si128(); + + if (((uintptr_t)lhs & 0xf) == 0 && ((uintptr_t)rhs & 0xf) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i 
*)(lhs)); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)(lhs)); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)(rhs)); + FMA_INT4_ITER_SSE(xmm_lhs, xmm_rhs, xmm_sum) + } + } + float result = static_cast(HorizontalAdd_INT32_V128(xmm_sum)); + + switch (last - lhs) { + case 15: + FMA_INT4_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT4_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT4_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT4_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT4_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT4_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT4_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT4_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT4_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT4_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT4_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT4_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT4_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT4_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT4_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; +} + +} // namespace zvec::turbo::sse::internal + +#endif // defined(__SSE4_1__) diff --git a/src/turbo/sse/record_quantized_int4/inner_product.cc b/src/turbo/sse/record_quantized_int4/inner_product.cc index 29c04b718..47121a668 100644 --- a/src/turbo/sse/record_quantized_int4/inner_product.cc +++ b/src/turbo/sse/record_quantized_int4/inner_product.cc @@ -15,17 +15,17 @@ #include "sse/record_quantized_int4/inner_product.h" #include 
"sse/record_quantized_int4/common.h" -#if defined(__SSE__) +#if defined(__SSE4_1__) #include #endif namespace zvec::turbo::sse { -// Compute squared Euclidean distance between a single quantized INT4 +// Compute squared inner product distance between a single quantized INT4 // vector pair. void inner_product_int4_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__SSE__) +#if defined(__SSE4_1__) const int d = dim - 32; const size_t original_dim = d >> 1; @@ -55,14 +55,14 @@ void inner_product_int4_distance(const void *a, const void *b, size_t dim, (void)b; (void)dim; (void)distance; -#endif //__SSE__ +#endif //__SSE4_1__ } // Batch version of inner_product_int4_distance. void inner_product_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__SSE__) +#if defined(__SSE4_1__) #else (void)vectors; @@ -70,7 +70,7 @@ void inner_product_int4_batch_distance(const void *const *vectors, (void)n; (void)dim; (void)distances; -#endif //__SSE__ +#endif //__SSE4_1__ } } // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc index c771ffb19..59155e2f3 100644 --- a/src/turbo/sse/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/sse/record_quantized_int4/squared_euclidean.cc @@ -15,7 +15,7 @@ #include "sse/record_quantized_int4/squared_euclidean.h" #include "sse/record_quantized_int4/common.h" -#if defined(__SSE__) +#if defined(__SSE4_1__) #include #endif @@ -23,20 +23,48 @@ namespace zvec::turbo::sse { void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__SSE__) +#if defined(__SSE4_1__) + const int d = dim - 32; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = 
reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + float qs2 = a_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + float ms2 = b_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); #else (void)a; (void)b; (void)dim; (void)distance; -#endif // __SSE__ +#endif // __SSE4_1__ } void squared_euclidean_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__SSE__) +#if defined(__SSE4_1__) #else (void)vectors; @@ -44,7 +72,7 @@ void squared_euclidean_int4_batch_distance(const void *const *vectors, (void)n; (void)dim; (void)distances; -#endif //__SSE__ +#endif //__SSE4_1__ } } // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc index d51ee0cf6..3fb001204 100644 --- a/src/turbo/sse/record_quantized_int8/squared_euclidean.cc +++ b/src/turbo/sse/record_quantized_int8/squared_euclidean.cc @@ -23,7 +23,33 @@ namespace zvec::turbo::sse { void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__SSE__) + const int original_dim = dim - 20; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_sse(a, b, original_dim, distance); + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 
= b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); #else (void)a; (void)b; diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc index 587203108..8d09f97cd 100644 --- a/tests/turbo/turbo_quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -35,6 +35,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); @@ -114,6 +115,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); @@ -140,6 +142,85 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { query_vec[j] = dist(gen); } + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float 
score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + // ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + // ASSERT_NEAR(score_scalar, score_avx2, 0.001); + // ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + 
for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { @@ -183,3 +264,268 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { ASSERT_NEAR(score_scalar, score_sse, 0.001); } } + +TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string 
query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + // ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + // ASSERT_NEAR(score_scalar, score_avx2, 0.001); + // ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +TEST(QuantizedIntegerMetric, TestInt8Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + + // int8 converter + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto &convert_meta = converter->meta(); + auto reformer = 
IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + std::string fp32_query_out; + ASSERT_EQ(0, + fp32_reformer->transform(query_vec.data(), qmeta, &fp32_query_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + func_float32(fp32_query_out.data(), fp32_doc_out.data(), + fp32_qmeta_reformer.dimension(), &score_float32); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + 
std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} + +TEST(QuantizedIntegerMetric, TestInt4Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_float32 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto func_avx2 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto func_sse = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + 
ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx2); + + func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_sse); + + ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); + ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); + // ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + // ASSERT_NEAR(score_scalar, score_avx2, 0.001); + // ASSERT_NEAR(score_scalar, score_sse, 0.001); + } +} From faa7e643d0faccc78b3d545d62a7f5178a4ec24e Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 20:33:22 +0800 Subject: [PATCH 18/44] feat: add fp16 funcs --- src/turbo/avx/half_float/common.h | 23 +++++++++ src/turbo/avx/half_float/cosine.cc | 49 +++++++++++++++++++ src/turbo/avx/half_float/cosine.h | 30 ++++++++++++ src/turbo/avx/half_float/inner_product.cc | 45 +++++++++++++++++ src/turbo/avx/half_float/inner_product.h | 31 ++++++++++++ 
src/turbo/avx/half_float/squared_euclidean.cc | 49 +++++++++++++++++++ src/turbo/avx/half_float/squared_euclidean.h | 31 ++++++++++++ .../common.h | 0 src/turbo/avx512/half_float/cosine.cc | 49 +++++++++++++++++++ src/turbo/avx512/half_float/cosine.h | 30 ++++++++++++ src/turbo/avx512/half_float/inner_product.cc | 45 +++++++++++++++++ src/turbo/avx512/half_float/inner_product.h | 31 ++++++++++++ .../avx512/half_float/squared_euclidean.cc | 49 +++++++++++++++++++ .../avx512/half_float/squared_euclidean.h | 31 ++++++++++++ 14 files changed, 493 insertions(+) create mode 100644 src/turbo/avx/half_float/common.h create mode 100644 src/turbo/avx/half_float/cosine.cc create mode 100644 src/turbo/avx/half_float/cosine.h create mode 100644 src/turbo/avx/half_float/inner_product.cc create mode 100644 src/turbo/avx/half_float/inner_product.h create mode 100644 src/turbo/avx/half_float/squared_euclidean.cc create mode 100644 src/turbo/avx/half_float/squared_euclidean.h rename src/turbo/avx512/{half_float_converter => half_float}/common.h (100%) create mode 100644 src/turbo/avx512/half_float/cosine.cc create mode 100644 src/turbo/avx512/half_float/cosine.h create mode 100644 src/turbo/avx512/half_float/inner_product.cc create mode 100644 src/turbo/avx512/half_float/inner_product.h create mode 100644 src/turbo/avx512/half_float/squared_euclidean.cc create mode 100644 src/turbo/avx512/half_float/squared_euclidean.h diff --git a/src/turbo/avx/half_float/common.h b/src/turbo/avx/half_float/common.h new file mode 100644 index 000000000..13be3a2bf --- /dev/null +++ b/src/turbo/avx/half_float/common.h @@ -0,0 +1,23 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared AVX half-float (FP16) distance kernels for the half_float
+// implementations (cosine, inner_product, squared_euclidean).
+//
+// All functions are marked always_inline so that when this header is included
+// from a per-file-march .cc translation unit, the compiler can fully inline
+// and optimize them under the correct -march flag without any cross-TU call
+// overhead.
+
+#pragma once
diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc
new file mode 100644
index 000000000..ff319539a
--- /dev/null
+++ b/src/turbo/avx/half_float/cosine.cc
@@ -0,0 +1,49 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "avx/float32/cosine.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/cosine.h b/src/turbo/avx/half_float/cosine.h new file mode 100644 index 000000000..5bd0a66f5 --- /dev/null +++ b/src/turbo/avx/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP16 vector pair. +void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp16_distance. 
+void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc new file mode 100644 index 000000000..707fb12c2 --- /dev/null +++ b/src/turbo/avx/half_float/inner_product.cc @@ -0,0 +1,45 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/float32/inner_product.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.h b/src/turbo/avx/half_float/inner_product.h new file mode 100644 index 000000000..083a35f6f --- /dev/null +++ b/src/turbo/avx/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc new file mode 100644 index 000000000..c81bb2e2c --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/float32/squared_euclidean.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX__) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.h b/src/turbo/avx/half_float/squared_euclidean.h new file mode 100644 index 000000000..013b1f118 --- /dev/null +++ b/src/turbo/avx/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx512/half_float_converter/common.h b/src/turbo/avx512/half_float/common.h similarity index 100% rename from src/turbo/avx512/half_float_converter/common.h rename to src/turbo/avx512/half_float/common.h diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc new file mode 100644 index 000000000..76791ad8a --- /dev/null +++ b/src/turbo/avx512/half_float/cosine.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/float32/cosine.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx512/half_float/cosine.h b/src/turbo/avx512/half_float/cosine.h new file mode 100644 index 000000000..514a705e0 --- /dev/null +++ b/src/turbo/avx512/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. 
+void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc new file mode 100644 index 000000000..5e34f0bb6 --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -0,0 +1,45 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx/float32/inner_product.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +// Compute squared Euclidean distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { + (void)a; + (void)b; + (void)dim; + (void)distance; +} + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.h b/src/turbo/avx512/half_float/inner_product.h new file mode 100644 index 000000000..083a35f6f --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute inner product distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc new file mode 100644 index 000000000..710738d24 --- /dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx/float32/squared_euclidean.h" +#include "avx/float32/common.h" + +#if defined(__AVX__) +#include +#endif + +namespace zvec::turbo::avx { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX__) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX__ +} + +} // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx512/half_float/squared_euclidean.h b/src/turbo/avx512/half_float/squared_euclidean.h new file mode 100644 index 000000000..9e11f15bc --- /dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx From c073035cbb0a980aaf3685aff06236ae62ac0205 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 31 Mar 2026 21:12:42 +0800 Subject: [PATCH 19/44] feat: add dist funcs --- src/turbo/avx/float32/cosine.cc | 7 ++ src/turbo/avx/float32/inner_product.cc | 70 +++++++++++++++++++ src/turbo/avx/float32/squared_euclidean.cc | 68 ++++++++++++++++++ src/turbo/avx/half_float/common.h | 23 ------ src/turbo/avx/half_float/cosine.cc | 7 ++ .../avx/half_float/euclidean_squared_common.h | 69 ++++++++++++++++++ src/turbo/avx/half_float/inner_product.cc | 4 ++ .../avx/half_float/inner_product_common.h | 66 +++++++++++++++++ src/turbo/avx/half_float/squared_euclidean.cc | 2 +- 9 files changed, 292 insertions(+), 24 deletions(-) delete mode 100644 src/turbo/avx/half_float/common.h create mode 100644 src/turbo/avx/half_float/euclidean_squared_common.h create mode 100644 src/turbo/avx/half_float/inner_product_common.h diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc index 76791ad8a..a05ba5e39 100644 --- a/src/turbo/avx/float32/cosine.cc +++ b/src/turbo/avx/float32/cosine.cc @@ -14,6 +14,7 @@ #include "avx/float32/cosine.h" #include "avx/float32/common.h" +#include "avx/float32/inner_product.h" #if defined(__AVX__) #include @@ -24,7 +25,13 @@ namespace zvec::turbo::avx { void cosine_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) + constexpr size_t extra_dim = 1; + size_t d = dim - extra_dim; + float ip; + inner_product_fp32_avx(m, q, d, &ip); + + *out = 1 - ip; #else (void)a; (void)b; diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc index 5e34f0bb6..9a9a99a6e 100644 --- a/src/turbo/avx/float32/inner_product.cc +++ b/src/turbo/avx/float32/inner_product.cc @@ -25,10 +25,80 @@ namespace zvec::turbo::avx { 
// vector pair. void inner_product_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { +#if defined(__AVX__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + size; + const float *last_aligned = lhs + ((dim >> 4) << 4); + + __m256 ymm_sum_0 = _mm256_setzero_ps(); + __m256 ymm_sum_1 = _mm256_setzero_ps(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_lhs_0 = _mm256_load_ps(lhs + 0); + __m256 ymm_lhs_1 = _mm256_load_ps(lhs + 8); + __m256 ymm_rhs_0 = _mm256_load_ps(rhs + 0); + __m256 ymm_rhs_1 = _mm256_load_ps(rhs + 8); + ymm_sum_0 = _mm256_fmadd_ps(ymm_lhs_0, ymm_rhs_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_lhs_1, ymm_rhs_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + ymm_sum_0 = + _mm256_fmadd_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs), ymm_sum_0); + lhs += 8; + rhs += 8; + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_lhs_0 = _mm256_loadu_ps(lhs + 0); + __m256 ymm_lhs_1 = _mm256_loadu_ps(lhs + 8); + __m256 ymm_rhs_0 = _mm256_loadu_ps(rhs + 0); + __m256 ymm_rhs_1 = _mm256_loadu_ps(rhs + 8); + ymm_sum_0 = _mm256_fmadd_ps(ymm_lhs_0, ymm_rhs_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_lhs_1, ymm_rhs_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + ymm_sum_0 = _mm256_fmadd_ps(_mm256_loadu_ps(lhs), _mm256_loadu_ps(rhs), + ymm_sum_0); + lhs += 8; + rhs += 8; + } + } + float result = HorizontalAdd_FP32_V256(_mm256_add_ps(ymm_sum_0, ymm_sum_1)); + + switch (last - lhs) { + case 7: + FMA_FP32_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_FP32_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_FP32_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_FP32_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + 
FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) + } + *distance = result; +#else (void)a; (void)b; (void)dim; (void)distance; +#endif // __AVX__ } // Batch version of inner_product_fp32_distance. diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc index 710738d24..cf72c58be 100644 --- a/src/turbo/avx/float32/squared_euclidean.cc +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -24,6 +24,74 @@ namespace zvec::turbo::avx { void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + dim; + const float *last_aligned = lhs + ((dim >> 4) << 4); + + __m256 ymm_sum_0 = _mm256_setzero_ps(); + __m256 ymm_sum_1 = _mm256_setzero_ps(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_d_0 = + _mm256_sub_ps(_mm256_load_ps(lhs + 0), _mm256_load_ps(rhs + 0)); + __m256 ymm_d_1 = + _mm256_sub_ps(_mm256_load_ps(lhs + 8), _mm256_load_ps(rhs + 8)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d_0, ymm_d_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_d_1, ymm_d_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + __m256 ymm_d = _mm256_sub_ps(_mm256_load_ps(lhs), _mm256_load_ps(rhs)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum_0); + lhs += 8; + rhs += 8; + } + } else { + for (; lhs != last_aligned; lhs += 16, rhs += 16) { + __m256 ymm_d_0 = + _mm256_sub_ps(_mm256_loadu_ps(lhs + 0), _mm256_loadu_ps(rhs + 0)); + __m256 ymm_d_1 = + _mm256_sub_ps(_mm256_loadu_ps(lhs + 8), _mm256_loadu_ps(rhs + 8)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d_0, ymm_d_0, ymm_sum_0); + ymm_sum_1 = _mm256_fmadd_ps(ymm_d_1, ymm_d_1, ymm_sum_1); + } + + if (last >= last_aligned + 8) { + __m256 ymm_d = _mm256_sub_ps(_mm256_loadu_ps(lhs), 
_mm256_loadu_ps(rhs)); + ymm_sum_0 = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum_0); + lhs += 8; + rhs += 8; + } + } + float result = HorizontalAdd_FP32_V256(_mm256_add_ps(ymm_sum_0, ymm_sum_1)); + + switch (last - lhs) { + case 7: + SSD_FP32_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + SSD_FP32_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + SSD_FP32_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + SSD_FP32_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + SSD_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(lhs[0], rhs[0], result) + } + + *distance = result; #else (void)a; diff --git a/src/turbo/avx/half_float/common.h b/src/turbo/avx/half_float/common.h deleted file mode 100644 index 13be3a2bf..000000000 --- a/src/turbo/avx/half_float/common.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- -#pragma once diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc index ff319539a..beeddb1af 100644 --- a/src/turbo/avx/half_float/cosine.cc +++ b/src/turbo/avx/half_float/cosine.cc @@ -14,6 +14,7 @@ #include "avx/float32/cosine.h" #include "avx/float32/common.h" +#include "avx/float32/inner_product.h" #if defined(__AVX__) #include @@ -24,7 +25,13 @@ namespace zvec::turbo::avx { void cosine_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) + constexpr size_t extra_dim = 2; + size_t d = dim - extra_dim; + float ip; + inner_product_fp16_avx(m, q, d, &ip); + + *out = 1 - ip; #else (void)a; (void)b; diff --git a/src/turbo/avx/half_float/euclidean_squared_common.h b/src/turbo/avx/half_float/euclidean_squared_common.h new file mode 100644 index 000000000..696f27d04 --- /dev/null +++ b/src/turbo/avx/half_float/euclidean_squared_common.h @@ -0,0 +1,69 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX__) + +//! 
Calculate sum of squared difference (AVX) +#define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + { \ + __m256 ymm_d = _mm256_sub_ps(ymm_m, ymm_q); \ + ymm_sum = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX SSD_FP32_AVX + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 4) << 4); \ + if (((uintptr_t)m & 0x1f) == 0 && ((uintptr_t)q & 0x1f) == 0) { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_load_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } else { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_loadu_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +#endif \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc index 707fb12c2..9ab24f12a 100644 --- a/src/turbo/avx/half_float/inner_product.cc +++ b/src/turbo/avx/half_float/inner_product.cc @@ -25,10 +25,14 @@ namespace zvec::turbo::avx { // vector pair. 
void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { +#if defined(__AVX__) + ACCUM_FP16_1X1_AVX(lhs, rhs, size, distance, 0ull, ) +#else (void)a; (void)b; (void)dim; (void)distance; +#endif // __AVX__ } // Batch version of inner_product_fp16_distance. diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h new file mode 100644 index 000000000..093de6549 --- /dev/null +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -0,0 +1,66 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX__) + +//! Calculate Fused-Multiply-Add (AVX) +#define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); + +#define ACCUM_FP32_STEP_AVX FMA_FP32_AVX + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 4) << 4); \ + if (((uintptr_t)m & 0x1f) == 0 && ((uintptr_t)q & 0x1f) == 0) { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_load_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_load_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } else { \ + for (; q != qe_aligned; m += 16, q += 16) { \ + MATRIX_FP16_ITER_1X1_AVX(m, q, ymm_sum, _mm256_loadu_si256, \ + ACCUM_FP32_STEP_AVX) \ + } \ + if (qe >= qe_aligned + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +#endif \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc index c81bb2e2c..2addf6cb2 100644 --- a/src/turbo/avx/half_float/squared_euclidean.cc +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -24,7 +24,7 @@ namespace zvec::turbo::avx { void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) - + ACCUM_FP16_1X1_AVX(lhs, rhs, size, distance, 0ull, ) #else (void)a; (void)b; From b6baa8904428d066884df0d0c58388f03fc06322 Mon Sep 17 00:00:00 2001 From: ray Date: Wed, 1 Apr 2026 11:56:04 +0800 Subject: [PATCH 20/44] feat: update ut --- 
src/turbo/CMakeLists.txt | 2 + src/turbo/avx/float32/inner_product.cc | 2 +- .../avx/half_float/euclidean_squared_common.h | 10 + src/turbo/avx/half_float/inner_product.cc | 9 +- .../avx/half_float/inner_product_common.h | 11 + src/turbo/avx/half_float/squared_euclidean.cc | 9 +- tests/turbo/turbo_cosine_test.cc | 586 +----------------- tests/turbo/turbo_euclidean_test.cc | 126 +--- tests/turbo/turbo_inner_product_test.cc | 184 ++++-- tests/turbo/turbo_quantized_integer_test.cc | 6 + 10 files changed, 172 insertions(+), 773 deletions(-) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 6f7416c70..3a8ab6a2a 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -42,6 +42,7 @@ endif() if(NOT ANDROID AND AUTO_DETECT_ARCH) if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc) + file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.cc) set_source_files_properties( ${AVX2_SRCS} PROPERTIES @@ -50,6 +51,7 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) endif() endif() + if(NOT ANDROID AND AUTO_DETECT_ARCH) if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") file(GLOB_RECURSE SSE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc) diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc index 9a9a99a6e..3c074e215 100644 --- a/src/turbo/avx/float32/inner_product.cc +++ b/src/turbo/avx/float32/inner_product.cc @@ -21,7 +21,7 @@ namespace zvec::turbo::avx { -// Compute squared Euclidean distance between a single quantized FP32 +// Compute inner product distance between a single quantized FP32 // vector pair. 
void inner_product_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { diff --git a/src/turbo/avx/half_float/euclidean_squared_common.h b/src/turbo/avx/half_float/euclidean_squared_common.h index 696f27d04..6578f28b9 100644 --- a/src/turbo/avx/half_float/euclidean_squared_common.h +++ b/src/turbo/avx/half_float/euclidean_squared_common.h @@ -24,6 +24,10 @@ #if defined(__AVX__) +#include + +using namespace zvec::ailego; + //! Calculate sum of squared difference (AVX) #define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ { \ @@ -33,6 +37,12 @@ #define ACCUM_FP32_STEP_AVX SSD_FP32_AVX +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + //! Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc index 9ab24f12a..4836d461d 100644 --- a/src/turbo/avx/half_float/inner_product.cc +++ b/src/turbo/avx/half_float/inner_product.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx/float32/inner_product.h" -#include "avx/float32/common.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/inner_product_common.h" #if defined(__AVX__) #include @@ -26,7 +26,10 @@ namespace zvec::turbo::avx { void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) - ACCUM_FP16_1X1_AVX(lhs, rhs, size, distance, 0ull, ) + const ailego::Float16 *lhs = reinterpret_cast(a); + const ailego::Float16 *rhs = reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, ) #else (void)a; (void)b; diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h index 093de6549..421bb41b3 100644 --- a/src/turbo/avx/half_float/inner_product_common.h +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -24,12 +24,23 @@ #if defined(__AVX__) +#include + +using namespace zvec::ailego; + //! Calculate Fused-Multiply-Add (AVX) #define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); #define ACCUM_FP32_STEP_AVX FMA_FP32_AVX +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + //! Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc index 2addf6cb2..a3f894a95 100644 --- a/src/turbo/avx/half_float/squared_euclidean.cc +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx/float32/squared_euclidean.h" -#include "avx/float32/common.h" +#include "avx/half_float/squared_euclidean.h" +#include "avx/half_float/euclidean_squared_common.h" #if defined(__AVX__) #include @@ -24,7 +24,10 @@ namespace zvec::turbo::avx { void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX__) - ACCUM_FP16_1X1_AVX(lhs, rhs, size, distance, 0ull, ) + const ailego::Float16 *lhs = reinterpret_cast(a); + const ailego::Float16 *rhs = reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, ) #else (void)a; (void)b; diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc index ce7ce94d0..83debae27 100644 --- a/tests/turbo/turbo_cosine_test.cc +++ b/tests/turbo/turbo_cosine_test.cc @@ -21,588 +21,6 @@ using namespace zvec; using namespace zvec::core; using namespace zvec::ailego; -#if 0 -static void Norm2(std::vector &vec, std::string *out) { - float norm = 0.0f; +TEST(CosineMetric, TestFp32Cosine) {} - out->resize(vec.size() * sizeof(Float16) + sizeof(float)); - - Norm2Matrix::Compute(vec.data(), vec.size(), &norm); - - Float16 *buf = reinterpret_cast(&(*out)[0]); - - for (uint32_t i = 0; i < vec.size(); ++i) { - buf[i] = vec[i] / norm; - } - - float *norm_buf = - reinterpret_cast(&(*out)[vec.size() * sizeof(Float16)]); - - memcpy(norm_buf, &norm, sizeof(float)); -} - -static void Norm2(std::vector &vec, std::string *out) { - float norm = 0.0f; - - out->resize((vec.size() + 1) * sizeof(float)); - - Norm2Matrix::Compute(vec.data(), vec.size(), &norm); - - float *buf = reinterpret_cast(&(*out)[0]); - for (uint32_t i = 0; i < vec.size(); ++i) { - buf[i] = vec[i] / norm; - } - - buf[vec.size()] = norm; -} - -static size_t ExtraDimension(IndexMeta::DataType type) { - // The extra quantized params storage size to save for each vector - if (type == IndexMeta::DT_FP32) return 1; - if (type == IndexMeta::DT_FP16) return 2; - - return 0; -} - 
-TEST(CosineMeasure_General_Test, General) { - auto measure = IndexFactory::CreateMetric("Cosine"); - EXPECT_TRUE(measure); - - IndexMeta meta; - meta.set_meta(IndexMeta::DT_INT16, 64); - ASSERT_NE(0, measure->init(meta, Params())); - meta.set_meta(IndexMeta::DT_FP16, 64); - ASSERT_EQ(0, measure->init(meta, Params())); - meta.set_meta(IndexMeta::DT_FP32, 64); - ASSERT_EQ(0, measure->init(meta, Params())); - meta.set_meta(IndexMeta::DT_INT8, 64); - ASSERT_NE(0, measure->init(meta, Params())); - - meta.set_meta(IndexMeta::DT_BINARY32, 64); - ASSERT_NE(0, measure->init(meta, Params())); - meta.set_meta(IndexMeta::DT_BINARY64, 64); - ASSERT_NE(0, measure->init(meta, Params())); - meta.set_meta(IndexMeta::DT_INT4, 64); - ASSERT_NE(0, measure->init(meta, Params())); - - IndexMeta meta2; - meta2.set_meta(IndexMeta::DT_BINARY32, 64); - EXPECT_FALSE(measure->is_matched(meta2)); - EXPECT_TRUE( - measure->is_matched(meta, IndexQueryMeta(IndexMeta::DT_FP32, 64))); - EXPECT_FALSE( - measure->is_matched(meta, IndexQueryMeta(IndexMeta::DT_FP32, 63))); - - EXPECT_FALSE(measure->distance_matrix(0, 0)); - EXPECT_FALSE(measure->distance_matrix(3, 5)); - EXPECT_FALSE(measure->distance_matrix(31, 65)); - EXPECT_TRUE(measure->distance_matrix(1, 1)); - EXPECT_FALSE(measure->distance_matrix(2, 1)); - EXPECT_FALSE(measure->distance_matrix(2, 2)); - EXPECT_FALSE(measure->distance_matrix(4, 1)); - EXPECT_FALSE(measure->distance_matrix(4, 2)); - EXPECT_FALSE(measure->distance_matrix(4, 4)); - EXPECT_FALSE(measure->distance_matrix(8, 1)); - EXPECT_FALSE(measure->distance_matrix(8, 2)); - EXPECT_FALSE(measure->distance_matrix(8, 4)); - EXPECT_FALSE(measure->distance_matrix(8, 8)); - EXPECT_FALSE(measure->distance_matrix(16, 1)); - EXPECT_FALSE(measure->distance_matrix(16, 2)); - EXPECT_FALSE(measure->distance_matrix(16, 4)); - EXPECT_FALSE(measure->distance_matrix(16, 8)); - EXPECT_FALSE(measure->distance_matrix(16, 16)); - EXPECT_FALSE(measure->distance_matrix(32, 1)); - 
EXPECT_FALSE(measure->distance_matrix(32, 2)); - EXPECT_FALSE(measure->distance_matrix(32, 4)); - EXPECT_FALSE(measure->distance_matrix(32, 8)); - EXPECT_FALSE(measure->distance_matrix(32, 16)); - EXPECT_FALSE(measure->distance_matrix(32, 32)); - - EXPECT_FALSE(measure->support_normalize()); - float result = 1.0f; - measure->normalize(&result); - EXPECT_FLOAT_EQ(1.0f, result); -} - -TEST(CosineMeasure_General_Test, TestDistanceFp32) { - { - constexpr uint32_t dimension = 2; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP32, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto distance = measure->distance(); - ASSERT_NE(distance, nullptr); - auto dist_matrix = measure->distance_matrix(1, 1); - ASSERT_NE(dist_matrix, nullptr); - - std::vector a = {0.2f, 0.9f}; - std::vector b = {0.3f, 0.5f}; - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float result = 0.0f; - distance(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP32), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.00001f, std::abs(result - 0.05131668f)); - - dist_matrix(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP32), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.00001f, std::abs(result - 0.05131668f)); - } - - { - constexpr uint32_t dimension = 3; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP32, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto distance = measure->distance(); - ASSERT_NE(distance, nullptr); - auto dist_matrix = measure->distance_matrix(1, 1); - ASSERT_NE(dist_matrix, 
nullptr); - - std::vector a = {0.2f, 0.9f, 0.6f}; - std::vector b = {0.3f, 0.5f, 0.7f}; - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float result = 0.0f; - distance(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP32), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.00001f, std::abs(result - 0.07199293f)); - - dist_matrix(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP32), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.00001f, std::abs(result - 0.07199293f)); - } - - { - constexpr uint32_t dimension = 11; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP32, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto distance = measure->distance(); - ASSERT_NE(distance, nullptr); - auto dist_matrix = measure->distance_matrix(1, 1); - ASSERT_NE(dist_matrix, nullptr); - - std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, - 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; - std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, - 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; - - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float result = 0.0f; - distance(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP32), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.00001f, std::abs(result - 0.2803060f)); - - dist_matrix(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP32), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.00001f, std::abs(result - 0.2803060f)); - } -} - -TEST(CosineMeasure_General_Test, TestDistanceFp16) { - { - constexpr uint32_t dimension = 2; - IndexMeta 
meta; - meta.set_meta(IndexMeta::DT_FP16, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto distance = measure->distance(); - ASSERT_NE(distance, nullptr); - auto dist_matrix = measure->distance_matrix(1, 1); - ASSERT_NE(dist_matrix, nullptr); - - std::vector a = {0.2f, 0.9f}; - std::vector b = {0.3f, 0.5f}; - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float result = 0.0f; - distance(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP16), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.001f, std::abs(result - 0.05131668f)); - - dist_matrix(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP16), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.001f, std::abs(result - 0.05131668f)); - } - - { - constexpr uint32_t dimension = 3; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP16, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto distance = measure->distance(); - ASSERT_NE(distance, nullptr); - auto dist_matrix = measure->distance_matrix(1, 1); - ASSERT_NE(dist_matrix, nullptr); - - std::vector a = {0.2f, 0.9f, 0.6f}; - std::vector b = {0.3f, 0.5f, 0.7f}; - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float result = 0.0f; - distance(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP16), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.001f, std::abs(result - 0.07199293f)); - - dist_matrix(a_out.data(), b_out.data(), - dimension + 
ExtraDimension(IndexMeta::DT_FP16), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.001f, std::abs(result - 0.07199293f)); - } - - { - constexpr uint32_t dimension = 11; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP16, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto distance = measure->distance(); - ASSERT_NE(distance, nullptr); - auto dist_matrix = measure->distance_matrix(1, 1); - ASSERT_NE(dist_matrix, nullptr); - - std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, - 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; - std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, - 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float result = 0.0f; - dist_matrix(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP16), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.001f, std::abs(result - 0.2803060f)); - - dist_matrix(a_out.data(), b_out.data(), - dimension + ExtraDimension(IndexMeta::DT_FP16), &result); - - if (measure->support_normalize()) { - measure->normalize(&result); - } - - EXPECT_GE(0.001f, std::abs(result - 0.2803060f)); - } -} - -TEST(CosineMeasure_General_Test, TestDistanceBatchFp16Simple) { - { - constexpr uint32_t dimension = 2; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP16, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto dist_batch = measure->batch_distance(); - ASSERT_NE(dist_batch, nullptr); - - std::vector a = {0.2f, 0.9f}; - std::vector b = {0.3f, 0.5f}; - - std::string a_out; - std::string b_out; - - - Norm2(a, &a_out); - Norm2(b, &b_out); - 
- float results[2] = {0.0f, 0.0f}; - - const void *vecs[2]; - vecs[0] = a_out.data(); - vecs[1] = b_out.data(); - dist_batch(vecs, b_out.data(), 2, - dimension + ExtraDimension(IndexMeta::DT_FP16), results); - - if (measure->support_normalize()) { - measure->normalize(&results[0]); - measure->normalize(&results[1]); - } - - EXPECT_GE(0.001f, std::abs(results[0] - 0.05131668f)); - EXPECT_GE(0.001f, std::abs(results[1] - 0.0f)); - } -} - -TEST(CosineMeasure_General_Test, TestDistanceBatchFp32Simple) { - { - constexpr uint32_t dimension = 2; - IndexMeta meta; - meta.set_meta(IndexMeta::DT_FP32, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto dist_batch = measure->batch_distance(); - ASSERT_NE(dist_batch, nullptr); - - std::vector a = {0.2f, 0.9f}; - std::vector b = {0.3f, 0.5f}; - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float results[2] = {0.0f, 0.0f}; - - const void *vecs[2]; - vecs[0] = a_out.data(); - vecs[1] = b_out.data(); - dist_batch(vecs, b_out.data(), 2, - dimension + ExtraDimension(IndexMeta::DT_FP32), results); - - if (measure->support_normalize()) { - measure->normalize(&results[0]); - measure->normalize(&results[1]); - } - - EXPECT_GE(0.00001f, std::abs(results[0] - 0.05131668f)); - EXPECT_GE(0.00001f, std::abs(results[1] - 0.0f)); - } -} - -template -void calculate_distance(std::vector &a, std::vector &b, size_t dimension, - IndexMeta::DataType data_type, size_t batch_size, - float expected_distance, float epsilon = 0.00001f) { - IndexMeta meta; - meta.set_meta(data_type, dimension); - - auto measure = IndexFactory::CreateMetric("Cosine"); - ASSERT_TRUE(measure); - Params params; - ASSERT_EQ(0, measure->init(meta, params)); - ASSERT_EQ(false, measure->support_train()); - - auto dist_batch = measure->batch_distance(); - ASSERT_NE(dist_batch, 
nullptr); - - std::string a_out; - std::string b_out; - - Norm2(a, &a_out); - Norm2(b, &b_out); - - float results[2] = {0.0f, 0.0f}; - - const void *vecs[2]; - vecs[0] = a_out.data(); - vecs[1] = b_out.data(); - dist_batch(vecs, b_out.data(), batch_size, - dimension + ExtraDimension(data_type), results); - - if (measure->support_normalize()) { - measure->normalize(&results[0]); - measure->normalize(&results[1]); - } - - EXPECT_GE(epsilon, std::abs(results[0] - expected_distance)); - EXPECT_GE(epsilon, std::abs(results[1] - 0.0f)); -} - - -TEST(CosineMeasure_General_Test, TestDistanceBatch) { - { - constexpr uint32_t dimension = 2; - - { - std::vector a = {0.2f, 0.9f}; - std::vector b = {0.3f, 0.5f}; - - calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 1, 0.05131668f, - 0.00001f); - calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 2, 0.05131668f, - 0.00001f); - } - { - std::vector a = {0.2f, 0.9f}; - std::vector b = {0.3f, 0.5f}; - - calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 1, 0.05131668f, - 0.001f); - calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 2, 0.05131668f, - 0.001f); - } - } - - { - constexpr uint32_t dimension = 3; - - - { - std::vector a = {0.2f, 0.9f, 0.6f}; - std::vector b = {0.3f, 0.5f, 0.7f}; - - calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 1, 0.07199293f, - 0.00001f); - calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 2, 0.07199293f, - 0.00001f); - } - { - std::vector a = {0.2f, 0.9f, 0.6f}; - std::vector b = {0.3f, 0.5f, 0.7f}; - - calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 1, 0.07199293f, - 0.001f); - calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 2, 0.07199293f, - 0.001f); - } - } - - { - constexpr uint32_t dimension = 11; - - { - std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, - 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; - std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, - 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; - - calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 1, 
0.2803060f, - 0.00001f); - calculate_distance(a, b, dimension, IndexMeta::DT_FP32, 2, 0.2803060f, - 0.00001f); - } - - { - std::vector a = {1.0f, 2.0f, 3.0f, 0.2f, 0.3f, 0.1f, - 5.2f, 2.1f, 7.1f, 6.8f, 1.2f}; - std::vector b = {2.0f, 4.0f, 6.0f, 0.6f, 0.7f, 0.9f, - 1.0f, 2.3f, 3.4f, 4.5f, 6.4f}; - - calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 1, 0.2803060f, - 0.001f); - calculate_distance(a, b, dimension, IndexMeta::DT_FP16, 2, 0.2803060f, - 0.001f); - } - } -} - -#endif \ No newline at end of file +TEST(CosineMetric, TestFp16Cosine) {} diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc index 644ee46d0..016cdc585 100644 --- a/tests/turbo/turbo_euclidean_test.cc +++ b/tests/turbo/turbo_euclidean_test.cc @@ -18,128 +18,6 @@ using namespace zvec; using namespace zvec::core; -#if 0 -TEST(SquaredEuclideanMetric, General) { - auto metric = IndexFactory::CreateMetric("SquaredEuclidean"); - EXPECT_TRUE(metric); +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclidean) {} - IndexMeta meta; - meta.set_meta(IndexMeta::DataType::DT_INT16, 64); - ASSERT_NE(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_BINARY32, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_BINARY64, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_FP16, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_FP32, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_INT4, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_INT8, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - - IndexMeta meta2; - meta2.set_meta(IndexMeta::DataType::DT_BINARY32, 64); - EXPECT_TRUE(metric->is_matched(meta)); - EXPECT_FALSE(metric->is_matched(meta2)); - EXPECT_TRUE(metric->is_matched( - meta, 
IndexQueryMeta(IndexMeta::DataType::DT_INT8, 64))); - EXPECT_FALSE(metric->is_matched( - meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 63))); - - EXPECT_FALSE(metric->distance_matrix(0, 0)); - EXPECT_FALSE(metric->distance_matrix(3, 5)); - EXPECT_FALSE(metric->distance_matrix(31, 65)); - EXPECT_TRUE(metric->distance_matrix(1, 1)); - EXPECT_TRUE(metric->distance_matrix(2, 1)); - EXPECT_TRUE(metric->distance_matrix(2, 2)); - EXPECT_TRUE(metric->distance_matrix(4, 1)); - EXPECT_TRUE(metric->distance_matrix(4, 2)); - EXPECT_TRUE(metric->distance_matrix(4, 4)); - EXPECT_TRUE(metric->distance_matrix(8, 1)); - EXPECT_TRUE(metric->distance_matrix(8, 2)); - EXPECT_TRUE(metric->distance_matrix(8, 4)); - EXPECT_TRUE(metric->distance_matrix(8, 8)); - EXPECT_FALSE(metric->distance_matrix(8, 32)); - EXPECT_FALSE(metric->distance_matrix(8, 9)); - EXPECT_TRUE(metric->distance_matrix(16, 1)); - EXPECT_TRUE(metric->distance_matrix(16, 2)); - EXPECT_TRUE(metric->distance_matrix(16, 4)); - EXPECT_TRUE(metric->distance_matrix(16, 8)); - EXPECT_TRUE(metric->distance_matrix(16, 16)); - EXPECT_FALSE(metric->distance_matrix(16, 17)); - EXPECT_TRUE(metric->distance_matrix(32, 1)); - EXPECT_TRUE(metric->distance_matrix(32, 2)); - EXPECT_TRUE(metric->distance_matrix(32, 4)); - EXPECT_TRUE(metric->distance_matrix(32, 8)); - EXPECT_TRUE(metric->distance_matrix(32, 16)); - EXPECT_TRUE(metric->distance_matrix(32, 32)); - - EXPECT_FALSE(metric->support_normalize()); - float result = 1.0f; - metric->normalize(&result); - EXPECT_FLOAT_EQ(1.0f, result); -} - -TEST(EuclideanMetric, General) { - auto metric = IndexFactory::CreateMetric("Euclidean"); - EXPECT_TRUE(metric); - - IndexMeta meta; - meta.set_meta(IndexMeta::DataType::DT_INT16, 64); - ASSERT_NE(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_BINARY32, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_BINARY64, 64); - ASSERT_EQ(0, metric->init(meta, 
ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_FP16, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_FP32, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_INT4, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_INT8, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - - IndexMeta meta2; - meta2.set_meta(IndexMeta::DataType::DT_BINARY32, 64); - EXPECT_TRUE(metric->is_matched(meta)); - EXPECT_FALSE(metric->is_matched(meta2)); - EXPECT_TRUE(metric->is_matched( - meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 64))); - EXPECT_FALSE(metric->is_matched( - meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 63))); - - EXPECT_FALSE(metric->distance_matrix(0, 0)); - EXPECT_FALSE(metric->distance_matrix(3, 5)); - EXPECT_FALSE(metric->distance_matrix(31, 65)); - EXPECT_TRUE(metric->distance_matrix(1, 1)); - EXPECT_TRUE(metric->distance_matrix(2, 1)); - EXPECT_TRUE(metric->distance_matrix(2, 2)); - EXPECT_TRUE(metric->distance_matrix(4, 1)); - EXPECT_TRUE(metric->distance_matrix(4, 2)); - EXPECT_TRUE(metric->distance_matrix(4, 4)); - EXPECT_TRUE(metric->distance_matrix(8, 1)); - EXPECT_TRUE(metric->distance_matrix(8, 2)); - EXPECT_TRUE(metric->distance_matrix(8, 4)); - EXPECT_TRUE(metric->distance_matrix(8, 8)); - EXPECT_TRUE(metric->distance_matrix(16, 1)); - EXPECT_TRUE(metric->distance_matrix(16, 2)); - EXPECT_TRUE(metric->distance_matrix(16, 4)); - EXPECT_TRUE(metric->distance_matrix(16, 8)); - EXPECT_TRUE(metric->distance_matrix(16, 16)); - EXPECT_TRUE(metric->distance_matrix(32, 1)); - EXPECT_TRUE(metric->distance_matrix(32, 2)); - EXPECT_TRUE(metric->distance_matrix(32, 4)); - EXPECT_TRUE(metric->distance_matrix(32, 8)); - EXPECT_TRUE(metric->distance_matrix(32, 16)); - EXPECT_TRUE(metric->distance_matrix(32, 32)); - - EXPECT_FALSE(metric->support_normalize()); - float result = 1.0f; - 
metric->normalize(&result); - EXPECT_FLOAT_EQ(1.0f, result); -} - -#endif \ No newline at end of file +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) {} diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc index 0ec1b567e..d5ef7df49 100644 --- a/tests/turbo/turbo_inner_product_test.cc +++ b/tests/turbo/turbo_inner_product_test.cc @@ -13,68 +13,136 @@ // limitations under the License. #include #include +#include +#include #include "zvec/core/framework/index_factory.h" using namespace zvec; using namespace zvec::core; +using namespace zvec::ailego; -#if 0 -TEST(InnerProductMetric, General) { - auto metric = IndexFactory::CreateMetric("InnerProduct"); - ASSERT_TRUE(metric); - - IndexMeta meta; - meta.set_meta(IndexMeta::DataType::DT_BINARY32, 64); - ASSERT_NE(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_BINARY64, 64); - ASSERT_NE(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_FP16, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_FP32, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_INT4, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - meta.set_meta(IndexMeta::DataType::DT_INT8, 64); - ASSERT_EQ(0, metric->init(meta, ailego::Params())); - - IndexMeta meta2; - meta2.set_meta(IndexMeta::DataType::DT_BINARY32, 64); - EXPECT_TRUE(metric->is_matched(meta)); - EXPECT_FALSE(metric->is_matched(meta2)); - EXPECT_TRUE(metric->is_matched( - meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 64))); - EXPECT_FALSE(metric->is_matched( - meta, IndexQueryMeta(IndexMeta::DataType::DT_INT8, 63))); - - EXPECT_FALSE(metric->distance_matrix(0, 0)); - EXPECT_FALSE(metric->distance_matrix(3, 5)); - EXPECT_FALSE(metric->distance_matrix(31, 65)); - EXPECT_TRUE(metric->distance_matrix(1, 1)); - EXPECT_TRUE(metric->distance_matrix(2, 1)); - 
EXPECT_TRUE(metric->distance_matrix(2, 2)); - EXPECT_TRUE(metric->distance_matrix(4, 1)); - EXPECT_TRUE(metric->distance_matrix(4, 2)); - EXPECT_TRUE(metric->distance_matrix(4, 4)); - EXPECT_TRUE(metric->distance_matrix(8, 1)); - EXPECT_TRUE(metric->distance_matrix(8, 2)); - EXPECT_TRUE(metric->distance_matrix(8, 4)); - EXPECT_TRUE(metric->distance_matrix(8, 8)); - EXPECT_TRUE(metric->distance_matrix(16, 1)); - EXPECT_TRUE(metric->distance_matrix(16, 2)); - EXPECT_TRUE(metric->distance_matrix(16, 4)); - EXPECT_TRUE(metric->distance_matrix(16, 8)); - EXPECT_TRUE(metric->distance_matrix(16, 16)); - EXPECT_TRUE(metric->distance_matrix(32, 1)); - EXPECT_TRUE(metric->distance_matrix(32, 2)); - EXPECT_TRUE(metric->distance_matrix(32, 4)); - EXPECT_TRUE(metric->distance_matrix(32, 8)); - EXPECT_TRUE(metric->distance_matrix(32, 16)); - EXPECT_TRUE(metric->distance_matrix(32, 32)); - - EXPECT_TRUE(metric->support_normalize()); - float result = 1.0f; - metric->normalize(&result); - EXPECT_FLOAT_EQ(-1.0f, result); +// Target Test Type: avx, avx512, scalar +TEST(InnerProductMetric, TestFp32InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector 
doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + ASSERT_NEAR(score_scalar, score_avx512, 0.001); + ASSERT_NEAR(score_scalar, score_avx, 0.001); + } } -#endif \ No newline at end of file +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(InnerProductMetric, TestFp16InnerProduct) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = 
dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + ASSERT_NEAR(score_scalar, score_avx512fp16, 0.001); + ASSERT_NEAR(score_scalar, score_avx512, 0.001); + ASSERT_NEAR(score_scalar, score_avx, 0.001); + } +} diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc index 8d09f97cd..2419eb7cb 100644 --- a/tests/turbo/turbo_quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -26,6 +26,7 @@ using namespace zvec; using namespace zvec::core; using namespace zvec::ailego; +// Target Test Type: avx2, sse, scalar TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); @@ -106,6 +107,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { } } +// Target Test Type: avx2, sse, scalar TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { std::mt19937 
gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); @@ -186,6 +188,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { } } +// Target Test Type: avx2, sse, scalar TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); @@ -265,6 +268,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { } } +// Target Test Type: avx2, sse, scalar TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); @@ -344,6 +348,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { } } +// Target Test Type: avx2, sse, scalar TEST(QuantizedIntegerMetric, TestInt8Cosine) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); @@ -450,6 +455,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { } } +// Target Test Type: avx2, sse, scalar TEST(QuantizedIntegerMetric, TestInt4Cosine) { std::mt19937 gen(15583); std::uniform_real_distribution dist(-1.0, 2.0); From 83b172c41d4f87db977950550ba7c271b6b9001d Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 2 Apr 2026 11:53:33 +0800 Subject: [PATCH 21/44] feat: add dist ut --- src/turbo/avx/float32/common.h | 23 ++++ src/turbo/avx/float32/cosine.cc | 4 +- src/turbo/avx/float32/inner_product.cc | 3 +- src/turbo/avx/float32/squared_euclidean.cc | 1 + src/turbo/avx/half_float/cosine.cc | 10 +- .../avx/half_float/euclidean_squared_common.h | 110 ++++++++++++++++++ src/turbo/avx/half_float/inner_product.h | 8 +- .../avx/half_float/inner_product_common.h | 110 +++++++++++++++++- 8 files changed, 256 insertions(+), 13 deletions(-) diff --git a/src/turbo/avx/float32/common.h b/src/turbo/avx/float32/common.h index 13be3a2bf..6d3f91d12 100644 --- a/src/turbo/avx/float32/common.h +++ b/src/turbo/avx/float32/common.h @@ -21,3 +21,26 @@ // overhead. 
#pragma once + +#if defined(__AVX__) + +#include + +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_FP32_GENERAL(m, q, sum) sum += (m * q); + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +#endif \ No newline at end of file diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc index a05ba5e39..42e858df3 100644 --- a/src/turbo/avx/float32/cosine.cc +++ b/src/turbo/avx/float32/cosine.cc @@ -29,9 +29,9 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, size_t d = dim - extra_dim; float ip; - inner_product_fp32_avx(m, q, d, &ip); + inner_product_fp32_distance(a, b, d, &ip); - *out = 1 - ip; + *distance = 1 - ip; #else (void)a; (void)b; diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc index 3c074e215..7e379721d 100644 --- a/src/turbo/avx/float32/inner_product.cc +++ b/src/turbo/avx/float32/inner_product.cc @@ -17,6 +17,7 @@ #if defined(__AVX__) #include +#include #endif namespace zvec::turbo::avx { @@ -29,7 +30,7 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim, const float *lhs = reinterpret_cast(a); const float *rhs = reinterpret_cast(b); - const float *last = lhs + size; + const float *last = lhs + dim; const float *last_aligned = lhs + ((dim >> 4) << 4); __m256 ymm_sum_0 = _mm256_setzero_ps(); diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc index cf72c58be..a74856b60 100644 --- a/src/turbo/avx/float32/squared_euclidean.cc +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -17,6 +17,7 @@ #if defined(__AVX__) #include +#include #endif namespace zvec::turbo::avx { diff --git 
a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc index beeddb1af..40ac05853 100644 --- a/src/turbo/avx/half_float/cosine.cc +++ b/src/turbo/avx/half_float/cosine.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx/float32/cosine.h" -#include "avx/float32/common.h" -#include "avx/float32/inner_product.h" +#include "avx/half_float/cosine.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/inner_product_common.h" #if defined(__AVX__) #include @@ -29,9 +29,9 @@ size_t d = dim - extra_dim; float ip; - inner_product_fp16_avx(m, q, d, &ip); + inner_product_fp16_distance(a, b, d, &ip); - *out = 1 - ip; + *distance = 1 - ip; #else (void)a; (void)b; diff --git a/src/turbo/avx/half_float/euclidean_squared_common.h b/src/turbo/avx/half_float/euclidean_squared_common.h index 6578f28b9..0e667a66b 100644 --- a/src/turbo/avx/half_float/euclidean_squared_common.h +++ b/src/turbo/avx/half_float/euclidean_squared_common.h @@ -24,10 +24,105 @@ #if defined(__AVX__) +#include #include using namespace zvec::ailego; +namespace zvec::turbo::avx { + + +//!
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + //! Calculate sum of squared difference (AVX) #define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ { \ @@ -43,6 +138,19 @@ using namespace zvec::ailego; #define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) +//! 
Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + //! Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ @@ -76,4 +184,6 @@ using namespace zvec::ailego; MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); +} // namespace zvec::turbo::avx + #endif \ No newline at end of file diff --git a/src/turbo/avx/half_float/inner_product.h b/src/turbo/avx/half_float/inner_product.h index 083a35f6f..08b5a8d73 100644 --- a/src/turbo/avx/half_float/inner_product.h +++ b/src/turbo/avx/half_float/inner_product.h @@ -18,13 +18,13 @@ namespace zvec::turbo::avx { -// Compute inner product distance between a single quantized FP32 +// Compute inner product distance between a single quantized FP16 // vector pair. -void inner_product_fp32_distance(const void *a, const void *b, size_t dim, +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance); -// Batch version of inner_product_fp32_distance. -void inner_product_fp32_batch_distance(const void *const *vectors, +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h index 421bb41b3..f8f5f377d 100644 --- a/src/turbo/avx/half_float/inner_product_common.h +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -24,10 +24,104 @@ #if defined(__AVX__) +#include #include using namespace zvec::ailego; +namespace zvec::turbo::avx { + +//! Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), 
(short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = 
_mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + //! Calculate Fused-Multiply-Add (AVX) #define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); @@ -37,10 +131,22 @@ using namespace zvec::ailego; #define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); - #define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + //! 
Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_AVX(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, __m256, ymm_sum, _mm256_setzero_ps()) \ @@ -74,4 +180,6 @@ using namespace zvec::ailego; MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); +} // namespace zvec::turbo::avx + #endif \ No newline at end of file From f9fe8ae7fe18c3fb2ba6db6961196eb9f7008611 Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 2 Apr 2026 12:55:09 +0800 Subject: [PATCH 22/44] feat: add dist funcs --- src/turbo/avx/float32/inner_product.cc | 2 +- src/turbo/avx512/half_float/common.h | 285 +--------------- src/turbo/avx512/half_float/cosine.cc | 18 +- src/turbo/avx512/half_float/cosine.h | 8 +- src/turbo/avx512/half_float/inner_product.cc | 18 +- src/turbo/avx512/half_float/inner_product.h | 10 +- .../avx512/half_float/squared_euclidean.cc | 22 +- .../avx512/half_float/squared_euclidean.h | 8 +- src/turbo/avx512fp16/half_float/common.h | 35 ++ src/turbo/avx512fp16/half_float/cosine.cc | 49 +++ src/turbo/avx512fp16/half_float/cosine.h | 30 ++ .../avx512fp16/half_float/inner_product.cc | 45 +++ .../avx512fp16/half_float/inner_product.h | 31 ++ .../half_float/squared_euclidean.cc | 49 +++ .../avx512fp16/half_float/squared_euclidean.h | 31 ++ .../avx512fp16/half_float_converter/common.h | 312 ------------------ .../scalar/{float16 => half_float}/cosine.cc | 4 +- .../scalar/{float16 => half_float}/cosine.h | 0 .../{float16 => half_float}/inner_product.cc | 2 +- .../{float16 => half_float}/inner_product.h | 0 .../squared_euclidean.cc | 2 +- .../squared_euclidean.h | 0 src/turbo/turbo.cc | 50 ++- tests/turbo/turbo_inner_product_test.cc | 4 +- 24 files changed, 358 insertions(+), 657 deletions(-) create mode 100644 src/turbo/avx512fp16/half_float/common.h create mode 100644 src/turbo/avx512fp16/half_float/cosine.cc create mode 100644 src/turbo/avx512fp16/half_float/cosine.h create mode 
100644 src/turbo/avx512fp16/half_float/inner_product.cc create mode 100644 src/turbo/avx512fp16/half_float/inner_product.h create mode 100644 src/turbo/avx512fp16/half_float/squared_euclidean.cc create mode 100644 src/turbo/avx512fp16/half_float/squared_euclidean.h delete mode 100644 src/turbo/avx512fp16/half_float_converter/common.h rename src/turbo/scalar/{float16 => half_float}/cosine.cc (93%) rename src/turbo/scalar/{float16 => half_float}/cosine.h (100%) rename src/turbo/scalar/{float16 => half_float}/inner_product.cc (97%) rename src/turbo/scalar/{float16 => half_float}/inner_product.h (100%) rename src/turbo/scalar/{float16 => half_float}/squared_euclidean.cc (96%) rename src/turbo/scalar/{float16 => half_float}/squared_euclidean.h (100%) diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc index 7e379721d..94ed2b0cd 100644 --- a/src/turbo/avx/float32/inner_product.cc +++ b/src/turbo/avx/float32/inner_product.cc @@ -93,7 +93,7 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim, case 1: FMA_FP32_GENERAL(lhs[0], rhs[0], result) } - *distance = result; + *distance = -1 * result; #else (void)a; (void)b; diff --git a/src/turbo/avx512/half_float/common.h b/src/turbo/avx512/half_float/common.h index 55fb5898c..ed8171c21 100644 --- a/src/turbo/avx512/half_float/common.h +++ b/src/turbo/avx512/half_float/common.h @@ -22,291 +22,14 @@ #pragma once -#if defined(__AVX512VNNI__) +#if defined(__AVX512F__) #include #include #include -namespace zvec::turbo::avx512_vnni::internal { +namespace zvec::turbo::avx512::internal { -static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { - __m256i x1 = _mm256_hadd_epi32(v, v); - __m256i x2 = _mm256_hadd_epi32(x1, x1); - __m128i x3 = _mm256_extractf128_si256(x2, 1); - __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); - return _mm_cvtsi128_si32(x4); -} -#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); +} // namespace 
zvec::turbo::avx512::internal -// Compute the raw integer inner product of two int8 vectors of length `size`. -// The result is written to `*distance` as a float. -// Both `a` and `b` must point to int8_t arrays. -static __attribute__((always_inline)) void ip_int8_avx512_vnni( - const void *a, const void *b, size_t size, float *distance) { - const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); - const __m128i ONES_INT16_SSE = _mm_set1_epi32(0x00010001); - - const int8_t *lhs = reinterpret_cast(a); - const int8_t *rhs = reinterpret_cast(b); - - const int8_t *last = lhs + size; - const int8_t *last_aligned = lhs + ((size >> 6) << 6); - - float result = 0.0f; - - __m256i ymm_sum_0 = _mm256_setzero_si256(); - __m256i ymm_sum_1 = _mm256_setzero_si256(); - - if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { - for (; lhs != last_aligned; lhs += 64, rhs += 64) { - __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); - __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); - __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); - __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); - - ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); - ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); - ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); - ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); - - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), - ONES_INT16_AVX), - ymm_sum_0); - ymm_sum_1 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), - ONES_INT16_AVX), - ymm_sum_1); - } - - if (last >= last_aligned + 32) { - __m256i ymm_lhs = _mm256_load_si256((const __m256i *)lhs); - __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); - ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); - ymm_rhs = _mm256_abs_epi8(ymm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), - ONES_INT16_AVX), - ymm_sum_0); 
- lhs += 32; - rhs += 32; - } - - if (last >= lhs + 16) { - __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); - __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); - xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); - xmm_rhs = _mm_abs_epi8(xmm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_set_m128i(_mm_setzero_si128(), - _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), - ONES_INT16_SSE)), - ymm_sum_0); - lhs += 16; - rhs += 16; - } - } else { - for (; lhs != last_aligned; lhs += 64, rhs += 64) { - __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); - __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); - __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); - __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); - - ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); - ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); - ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); - ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); - - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), - ONES_INT16_AVX), - ymm_sum_0); - ymm_sum_1 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), - ONES_INT16_AVX), - ymm_sum_1); - } - - if (last >= last_aligned + 32) { - __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); - __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); - ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); - ymm_rhs = _mm256_abs_epi8(ymm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), - ONES_INT16_AVX), - ymm_sum_0); - lhs += 32; - rhs += 32; - } - - if (last >= lhs + 16) { - __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); - __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); - xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); - xmm_rhs = _mm_abs_epi8(xmm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_set_m128i(_mm_setzero_si128(), - _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, 
xmm_lhs), - ONES_INT16_SSE)), - ymm_sum_0); - lhs += 16; - rhs += 16; - } - } - result = static_cast( - HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); - - switch (last - lhs) { - case 15: - FMA_INT8_GENERAL(lhs[14], rhs[14], result) - /* FALLTHRU */ - case 14: - FMA_INT8_GENERAL(lhs[13], rhs[13], result) - /* FALLTHRU */ - case 13: - FMA_INT8_GENERAL(lhs[12], rhs[12], result) - /* FALLTHRU */ - case 12: - FMA_INT8_GENERAL(lhs[11], rhs[11], result) - /* FALLTHRU */ - case 11: - FMA_INT8_GENERAL(lhs[10], rhs[10], result) - /* FALLTHRU */ - case 10: - FMA_INT8_GENERAL(lhs[9], rhs[9], result) - /* FALLTHRU */ - case 9: - FMA_INT8_GENERAL(lhs[8], rhs[8], result) - /* FALLTHRU */ - case 8: - FMA_INT8_GENERAL(lhs[7], rhs[7], result) - /* FALLTHRU */ - case 7: - FMA_INT8_GENERAL(lhs[6], rhs[6], result) - /* FALLTHRU */ - case 6: - FMA_INT8_GENERAL(lhs[5], rhs[5], result) - /* FALLTHRU */ - case 5: - FMA_INT8_GENERAL(lhs[4], rhs[4], result) - /* FALLTHRU */ - case 4: - FMA_INT8_GENERAL(lhs[3], rhs[3], result) - /* FALLTHRU */ - case 3: - FMA_INT8_GENERAL(lhs[2], rhs[2], result) - /* FALLTHRU */ - case 2: - FMA_INT8_GENERAL(lhs[1], rhs[1], result) - /* FALLTHRU */ - case 1: - FMA_INT8_GENERAL(lhs[0], rhs[0], result) - } - *distance = result; -} - -#undef FMA_INT8_GENERAL - -// Shift the first `original_dim` bytes of `query` in-place from int8 to uint8 -// by adding 128 to each element. The metadata tail beyond `original_dim` is -// left untouched. This prepares the query for use with dpbusd (uint8 * int8). -static __attribute__((always_inline)) void shift_int8_to_uint8_avx512( - void *query, size_t original_dim) { - const int8_t *input = reinterpret_cast(query); - uint8_t *output = reinterpret_cast(query); - - // 128 represented as int8_t wraps to -128, but two's complement addition - // produces the correct uint8 result. 
- const __m512i offset = _mm512_set1_epi8(static_cast(128)); - - size_t i = 0; - for (; i + 64 <= original_dim; i += 64) { - __m512i data = - _mm512_loadu_si512(reinterpret_cast(input + i)); - __m512i shifted = _mm512_add_epi8(data, offset); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(output + i), shifted); - } - for (; i < original_dim; ++i) { - output[i] = static_cast(static_cast(input[i]) + 128); - } -} - -// Compute raw integer inner products for a batch of int8 vectors against a -// single query. Uses AVX512-VNNI dpbusd instruction. -// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. -template -__attribute__((always_inline)) void ip_int8_batch_avx512_vnni_impl( - const void *query, const void *const *vectors, - const std::array &prefetch_ptrs, - size_t dimensionality, float *distances) { - __m512i accs[batch_size]; - for (size_t i = 0; i < batch_size; ++i) { - accs[i] = _mm512_setzero_si512(); - } - size_t dim = 0; - for (; dim + 64 <= dimensionality; dim += 64) { - __m512i q = _mm512_loadu_si512(reinterpret_cast( - reinterpret_cast(query) + dim)); - __m512i data_regs[batch_size]; - for (size_t i = 0; i < batch_size; ++i) { - data_regs[i] = _mm512_loadu_si512(reinterpret_cast( - reinterpret_cast(vectors[i]) + dim)); - } - for (size_t i = 0; i < batch_size; ++i) { - if (prefetch_ptrs[i]) { - _mm_prefetch( - reinterpret_cast( - reinterpret_cast(prefetch_ptrs[i]) + dim), - _MM_HINT_T0); - } - accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]); - } - } - std::array temp_results{}; - for (size_t i = 0; i < batch_size; ++i) { - temp_results[i] = _mm512_reduce_add_epi32(accs[i]); - } - for (; dim < dimensionality; ++dim) { - int q = static_cast(reinterpret_cast(query)[dim]); - for (size_t i = 0; i < batch_size; ++i) { - temp_results[i] += - q * - static_cast(reinterpret_cast(vectors[i])[dim]); - } - } - for (size_t i = 0; i < batch_size; ++i) { - distances[i] = static_cast(temp_results[i]); - } -} - -// Dispatch batched inner product over 
all `n` vectors with prefetching. -static __attribute__((always_inline)) void ip_int8_batch_avx512_vnni( - const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) { - static constexpr size_t batch_size = 2; - static constexpr size_t prefetch_step = 2; - size_t i = 0; - for (; i + batch_size <= n; i += batch_size) { - std::array prefetch_ptrs; - for (size_t j = 0; j < batch_size; ++j) { - if (i + j + batch_size * prefetch_step < n) { - prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; - } else { - prefetch_ptrs[j] = nullptr; - } - } - ip_int8_batch_avx512_vnni_impl( - query, &vectors[i], prefetch_ptrs, dim, distances + i); - } - for (; i < n; i++) { - std::array prefetch_ptrs{nullptr}; - ip_int8_batch_avx512_vnni_impl<1>(query, &vectors[i], prefetch_ptrs, dim, - distances + i); - } -} - -} // namespace zvec::turbo::avx512_vnni::internal - -#endif // defined(__AVX512VNNI__) +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc index 76791ad8a..e81e28f8f 100644 --- a/src/turbo/avx512/half_float/cosine.cc +++ b/src/turbo/avx512/half_float/cosine.cc @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx/float32/cosine.h" -#include "avx/float32/common.h" +#include "avx512/half_float/cosine.h" +#include "avx512/half_float/common.h" -#if defined(__AVX__) +#if defined(__AVX512F__) #include #endif -namespace zvec::turbo::avx { +namespace zvec::turbo::avx512 { -void cosine_fp32_distance(const void *a, const void *b, size_t dim, +void cosine_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX__) +#if defined(__AVX512F__) #else (void)a; @@ -33,9 +33,9 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, #endif // __AVX__ } -void cosine_fp32_batch_distance(const void *const *vectors, const void *query, +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX__) +#if defined(__AVX512F__) #else (void)vectors; @@ -46,4 +46,4 @@ void cosine_fp32_batch_distance(const void *const *vectors, const void *query, #endif //__AVX__ } -} // namespace zvec::turbo::avx \ No newline at end of file +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/cosine.h b/src/turbo/avx512/half_float/cosine.h index 514a705e0..1e068dd6e 100644 --- a/src/turbo/avx512/half_float/cosine.h +++ b/src/turbo/avx512/half_float/cosine.h @@ -16,15 +16,15 @@ #include -namespace zvec::turbo::avx { +namespace zvec::turbo::avx512 { // Compute cosine distance (negative inner product after normalization) between // a single quantized FP32 vector pair. -void cosine_fp32_distance(const void *a, const void *b, size_t dim, +void cosine_fp16_distance(const void *a, const void *b, size_t dim, float *distance); // Batch version of cosine_fp32_distance. 
-void cosine_fp32_batch_distance(const void *const *vectors, const void *query, +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx \ No newline at end of file +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc index 5e34f0bb6..62463f8c7 100644 --- a/src/turbo/avx512/half_float/inner_product.cc +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx/float32/inner_product.h" -#include "avx/float32/common.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/common.h" -#if defined(__AVX__) +#if defined(__AVX512F__) #include #endif -namespace zvec::turbo::avx { +namespace zvec::turbo::avx512 { -// Compute squared Euclidean distance between a single quantized FP32 +// Compute squared Euclidean distance between a single quantized FP16 // vector pair. -void inner_product_fp32_distance(const void *a, const void *b, size_t dim, +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { (void)a; (void)b; @@ -31,8 +31,8 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim, (void)distance; } -// Batch version of inner_product_fp32_distance. -void inner_product_fp32_batch_distance(const void *const *vectors, +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { (void)vectors; @@ -42,4 +42,4 @@ void inner_product_fp32_batch_distance(const void *const *vectors, (void)distances; } -} // namespace zvec::turbo::avx \ No newline at end of file +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/inner_product.h b/src/turbo/avx512/half_float/inner_product.h index 083a35f6f..833d4c8c3 100644 --- a/src/turbo/avx512/half_float/inner_product.h +++ b/src/turbo/avx512/half_float/inner_product.h @@ -16,16 +16,16 @@ #include -namespace zvec::turbo::avx { +namespace zvec::turbo::avx512 { -// Compute inner product distance between a single quantized FP32 +// Compute inner product distance between a single quantized FP16 // vector pair. -void inner_product_fp32_distance(const void *a, const void *b, size_t dim, +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance); // Batch version of inner_product_fp32_distance. -void inner_product_fp32_batch_distance(const void *const *vectors, +void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc index 710738d24..3ef21757d 100644 --- a/src/turbo/avx512/half_float/squared_euclidean.cc +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -12,38 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx/float32/squared_euclidean.h" -#include "avx/float32/common.h" +#include "avx512/half_float/squared_euclidean.h" +#include "avx512/half_float/common.h" -#if defined(__AVX__) +#if defined(__AVX512F__) #include #endif -namespace zvec::turbo::avx { +namespace zvec::turbo::avx512 { -void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__AVX__) +#if defined(__AVX512F__) #else (void)a; (void)b; (void)dim; (void)distance; -#endif // __AVX__ +#endif // __AVX512F__ } -void squared_euclidean_fp32_batch_distance(const void *const *vectors, +void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__AVX__) +#if defined(__AVX512F__) #else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; -#endif //__AVX__ +#endif //__AVX512F__ } -} // namespace zvec::turbo::avx \ No newline at end of file +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/squared_euclidean.h b/src/turbo/avx512/half_float/squared_euclidean.h index 9e11f15bc..399e238b0 100644 --- a/src/turbo/avx512/half_float/squared_euclidean.h +++ b/src/turbo/avx512/half_float/squared_euclidean.h @@ -16,16 +16,16 @@ #include -namespace zvec::turbo::avx { +namespace zvec::turbo::avx512 { // Compute squared euclidean distance between a single quantized FP32 // vector pair. -void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance); // Batch version of squared euclidean FP32. 
-void squared_euclidean_fp32_batch_distance(const void *const *vectors, +void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx +} // namespace zvec::turbo::avx512 diff --git a/src/turbo/avx512fp16/half_float/common.h b/src/turbo/avx512fp16/half_float/common.h new file mode 100644 index 000000000..da0574085 --- /dev/null +++ b/src/turbo/avx512fp16/half_float/common.h @@ -0,0 +1,35 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. 
+ +#pragma once + +#if defined(__AVX512FP16__) +#include +#include +#include + +namespace zvec::turbo::avx512fp16::internal { + + +} // namespace zvec::turbo::avx512fp16::internal + +#endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512fp16/half_float/cosine.cc b/src/turbo/avx512fp16/half_float/cosine.cc new file mode 100644 index 000000000..4c65cd343 --- /dev/null +++ b/src/turbo/avx512fp16/half_float/cosine.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "avx512fp16/half_float/cosine.h"
+#include "avx512fp16/half_float/common.h"
+
+#if defined(__AVX512FP16__)
+#include <immintrin.h>
+#endif
+
+namespace zvec::turbo::avx512fp16 {
+
+void cosine_fp16_distance(const void *a, const void *b, size_t dim,
+                          float *distance) {
+#if defined(__AVX512FP16__)
+
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif // __AVX512FP16__
+}
+
+void cosine_fp16_batch_distance(const void *const *vectors, const void *query,
+                                size_t n, size_t dim, float *distances) {
+#if defined(__AVX512FP16__)
+
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif //__AVX512FP16__
+}
+
+} // namespace zvec::turbo::avx512fp16
\ No newline at end of file
diff --git a/src/turbo/avx512fp16/half_float/cosine.h b/src/turbo/avx512fp16/half_float/cosine.h
new file mode 100644
index 000000000..629bc9365
--- /dev/null
+++ b/src/turbo/avx512fp16/half_float/cosine.h
@@ -0,0 +1,30 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+
+namespace zvec::turbo::avx512fp16 {
+
+// Compute cosine distance (negative inner product after normalization) between
+// a single quantized FP16 vector pair.
+void cosine_fp16_distance(const void *a, const void *b, size_t dim,
+                          float *distance);
+
+// Batch version of cosine_fp16_distance.
+void cosine_fp16_batch_distance(const void *const *vectors, const void *query,
+                                size_t n, size_t dim, float *distances);
+
+} // namespace zvec::turbo::avx512fp16
\ No newline at end of file
diff --git a/src/turbo/avx512fp16/half_float/inner_product.cc b/src/turbo/avx512fp16/half_float/inner_product.cc
new file mode 100644
index 000000000..1b2870c54
--- /dev/null
+++ b/src/turbo/avx512fp16/half_float/inner_product.cc
@@ -0,0 +1,45 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "avx512fp16/half_float/inner_product.h"
+#include "avx512fp16/half_float/common.h"
+
+#if defined(__AVX512FP16__)
+#include <immintrin.h>
+#endif
+
+namespace zvec::turbo::avx512fp16 {
+
+// Compute inner product distance between a single quantized FP16
+// vector pair.
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim,
+                                 float *distance) {
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+}
+
+// Batch version of inner_product_fp16_distance.
+void inner_product_fp16_batch_distance(const void *const *vectors,
+                                       const void *query, size_t n, size_t dim,
+                                       float *distances) {
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+}
+
+} // namespace zvec::turbo::avx512fp16
\ No newline at end of file
diff --git a/src/turbo/avx512fp16/half_float/inner_product.h b/src/turbo/avx512fp16/half_float/inner_product.h
new file mode 100644
index 000000000..dbd9d9f58
--- /dev/null
+++ b/src/turbo/avx512fp16/half_float/inner_product.h
@@ -0,0 +1,31 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+
+namespace zvec::turbo::avx512fp16 {
+
+// Compute inner product distance between a single quantized FP16
+// vector pair.
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim,
+                                 float *distance);
+
+// Batch version of inner_product_fp16_distance.
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512fp16 diff --git a/src/turbo/avx512fp16/half_float/squared_euclidean.cc b/src/turbo/avx512fp16/half_float/squared_euclidean.cc new file mode 100644 index 000000000..cefd49b97 --- /dev/null +++ b/src/turbo/avx512fp16/half_float/squared_euclidean.cc @@ -0,0 +1,49 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "avx512fp16/half_float/squared_euclidean.h" +#include "avx512fp16/half_float/common.h" + +#if defined(__AVX512F__) +#include +#endif + +namespace zvec::turbo::avx512fp16 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512FP16__) + +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512FP16__) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512fp16 \ No newline at end of file diff --git a/src/turbo/avx512fp16/half_float/squared_euclidean.h b/src/turbo/avx512fp16/half_float/squared_euclidean.h new file mode 100644 index 000000000..f3a13d3d2 --- /dev/null +++ b/src/turbo/avx512fp16/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::avx512fp16 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. 
+void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::avx512fp16 diff --git a/src/turbo/avx512fp16/half_float_converter/common.h b/src/turbo/avx512fp16/half_float_converter/common.h deleted file mode 100644 index 55fb5898c..000000000 --- a/src/turbo/avx512fp16/half_float_converter/common.h +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - -#pragma once - -#if defined(__AVX512VNNI__) -#include -#include -#include - -namespace zvec::turbo::avx512_vnni::internal { - -static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { - __m256i x1 = _mm256_hadd_epi32(v, v); - __m256i x2 = _mm256_hadd_epi32(x1, x1); - __m128i x3 = _mm256_extractf128_si256(x2, 1); - __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); - return _mm_cvtsi128_si32(x4); -} - -#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); - -// Compute the raw integer inner product of two int8 vectors of length `size`. 
-// The result is written to `*distance` as a float. -// Both `a` and `b` must point to int8_t arrays. -static __attribute__((always_inline)) void ip_int8_avx512_vnni( - const void *a, const void *b, size_t size, float *distance) { - const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); - const __m128i ONES_INT16_SSE = _mm_set1_epi32(0x00010001); - - const int8_t *lhs = reinterpret_cast(a); - const int8_t *rhs = reinterpret_cast(b); - - const int8_t *last = lhs + size; - const int8_t *last_aligned = lhs + ((size >> 6) << 6); - - float result = 0.0f; - - __m256i ymm_sum_0 = _mm256_setzero_si256(); - __m256i ymm_sum_1 = _mm256_setzero_si256(); - - if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { - for (; lhs != last_aligned; lhs += 64, rhs += 64) { - __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); - __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); - __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); - __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); - - ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); - ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); - ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); - ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); - - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), - ONES_INT16_AVX), - ymm_sum_0); - ymm_sum_1 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), - ONES_INT16_AVX), - ymm_sum_1); - } - - if (last >= last_aligned + 32) { - __m256i ymm_lhs = _mm256_load_si256((const __m256i *)lhs); - __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); - ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); - ymm_rhs = _mm256_abs_epi8(ymm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), - ONES_INT16_AVX), - ymm_sum_0); - lhs += 32; - rhs += 32; - } - - if (last >= lhs + 16) { - __m128i xmm_lhs = _mm_load_si128((const __m128i 
*)lhs); - __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); - xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); - xmm_rhs = _mm_abs_epi8(xmm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_set_m128i(_mm_setzero_si128(), - _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), - ONES_INT16_SSE)), - ymm_sum_0); - lhs += 16; - rhs += 16; - } - } else { - for (; lhs != last_aligned; lhs += 64, rhs += 64) { - __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); - __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); - __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); - __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); - - ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); - ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); - ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); - ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); - - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), - ONES_INT16_AVX), - ymm_sum_0); - ymm_sum_1 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), - ONES_INT16_AVX), - ymm_sum_1); - } - - if (last >= last_aligned + 32) { - __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); - __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); - ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); - ymm_rhs = _mm256_abs_epi8(ymm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), - ONES_INT16_AVX), - ymm_sum_0); - lhs += 32; - rhs += 32; - } - - if (last >= lhs + 16) { - __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); - __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); - xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); - xmm_rhs = _mm_abs_epi8(xmm_rhs); - ymm_sum_0 = _mm256_add_epi32( - _mm256_set_m128i(_mm_setzero_si128(), - _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), - ONES_INT16_SSE)), - ymm_sum_0); - lhs += 16; - rhs += 16; - } - } - result = static_cast( - 
HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); - - switch (last - lhs) { - case 15: - FMA_INT8_GENERAL(lhs[14], rhs[14], result) - /* FALLTHRU */ - case 14: - FMA_INT8_GENERAL(lhs[13], rhs[13], result) - /* FALLTHRU */ - case 13: - FMA_INT8_GENERAL(lhs[12], rhs[12], result) - /* FALLTHRU */ - case 12: - FMA_INT8_GENERAL(lhs[11], rhs[11], result) - /* FALLTHRU */ - case 11: - FMA_INT8_GENERAL(lhs[10], rhs[10], result) - /* FALLTHRU */ - case 10: - FMA_INT8_GENERAL(lhs[9], rhs[9], result) - /* FALLTHRU */ - case 9: - FMA_INT8_GENERAL(lhs[8], rhs[8], result) - /* FALLTHRU */ - case 8: - FMA_INT8_GENERAL(lhs[7], rhs[7], result) - /* FALLTHRU */ - case 7: - FMA_INT8_GENERAL(lhs[6], rhs[6], result) - /* FALLTHRU */ - case 6: - FMA_INT8_GENERAL(lhs[5], rhs[5], result) - /* FALLTHRU */ - case 5: - FMA_INT8_GENERAL(lhs[4], rhs[4], result) - /* FALLTHRU */ - case 4: - FMA_INT8_GENERAL(lhs[3], rhs[3], result) - /* FALLTHRU */ - case 3: - FMA_INT8_GENERAL(lhs[2], rhs[2], result) - /* FALLTHRU */ - case 2: - FMA_INT8_GENERAL(lhs[1], rhs[1], result) - /* FALLTHRU */ - case 1: - FMA_INT8_GENERAL(lhs[0], rhs[0], result) - } - *distance = result; -} - -#undef FMA_INT8_GENERAL - -// Shift the first `original_dim` bytes of `query` in-place from int8 to uint8 -// by adding 128 to each element. The metadata tail beyond `original_dim` is -// left untouched. This prepares the query for use with dpbusd (uint8 * int8). -static __attribute__((always_inline)) void shift_int8_to_uint8_avx512( - void *query, size_t original_dim) { - const int8_t *input = reinterpret_cast(query); - uint8_t *output = reinterpret_cast(query); - - // 128 represented as int8_t wraps to -128, but two's complement addition - // produces the correct uint8 result. 
- const __m512i offset = _mm512_set1_epi8(static_cast(128)); - - size_t i = 0; - for (; i + 64 <= original_dim; i += 64) { - __m512i data = - _mm512_loadu_si512(reinterpret_cast(input + i)); - __m512i shifted = _mm512_add_epi8(data, offset); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(output + i), shifted); - } - for (; i < original_dim; ++i) { - output[i] = static_cast(static_cast(input[i]) + 128); - } -} - -// Compute raw integer inner products for a batch of int8 vectors against a -// single query. Uses AVX512-VNNI dpbusd instruction. -// `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. -template -__attribute__((always_inline)) void ip_int8_batch_avx512_vnni_impl( - const void *query, const void *const *vectors, - const std::array &prefetch_ptrs, - size_t dimensionality, float *distances) { - __m512i accs[batch_size]; - for (size_t i = 0; i < batch_size; ++i) { - accs[i] = _mm512_setzero_si512(); - } - size_t dim = 0; - for (; dim + 64 <= dimensionality; dim += 64) { - __m512i q = _mm512_loadu_si512(reinterpret_cast( - reinterpret_cast(query) + dim)); - __m512i data_regs[batch_size]; - for (size_t i = 0; i < batch_size; ++i) { - data_regs[i] = _mm512_loadu_si512(reinterpret_cast( - reinterpret_cast(vectors[i]) + dim)); - } - for (size_t i = 0; i < batch_size; ++i) { - if (prefetch_ptrs[i]) { - _mm_prefetch( - reinterpret_cast( - reinterpret_cast(prefetch_ptrs[i]) + dim), - _MM_HINT_T0); - } - accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]); - } - } - std::array temp_results{}; - for (size_t i = 0; i < batch_size; ++i) { - temp_results[i] = _mm512_reduce_add_epi32(accs[i]); - } - for (; dim < dimensionality; ++dim) { - int q = static_cast(reinterpret_cast(query)[dim]); - for (size_t i = 0; i < batch_size; ++i) { - temp_results[i] += - q * - static_cast(reinterpret_cast(vectors[i])[dim]); - } - } - for (size_t i = 0; i < batch_size; ++i) { - distances[i] = static_cast(temp_results[i]); - } -} - -// Dispatch batched inner product over 
all `n` vectors with prefetching. -static __attribute__((always_inline)) void ip_int8_batch_avx512_vnni( - const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) { - static constexpr size_t batch_size = 2; - static constexpr size_t prefetch_step = 2; - size_t i = 0; - for (; i + batch_size <= n; i += batch_size) { - std::array prefetch_ptrs; - for (size_t j = 0; j < batch_size; ++j) { - if (i + j + batch_size * prefetch_step < n) { - prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; - } else { - prefetch_ptrs[j] = nullptr; - } - } - ip_int8_batch_avx512_vnni_impl( - query, &vectors[i], prefetch_ptrs, dim, distances + i); - } - for (; i < n; i++) { - std::array prefetch_ptrs{nullptr}; - ip_int8_batch_avx512_vnni_impl<1>(query, &vectors[i], prefetch_ptrs, dim, - distances + i); - } -} - -} // namespace zvec::turbo::avx512_vnni::internal - -#endif // defined(__AVX512VNNI__) diff --git a/src/turbo/scalar/float16/cosine.cc b/src/turbo/scalar/half_float/cosine.cc similarity index 93% rename from src/turbo/scalar/float16/cosine.cc rename to src/turbo/scalar/half_float/cosine.cc index 4999cc8c2..7c46eb0f5 100644 --- a/src/turbo/scalar/float16/cosine.cc +++ b/src/turbo/scalar/half_float/cosine.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "scalar/float16/cosine.h" -#include "scalar/float16/inner_product.h" +#include "scalar/half_float/cosine.h" +#include "scalar/half_float/inner_product.h" namespace zvec::turbo::scalar { diff --git a/src/turbo/scalar/float16/cosine.h b/src/turbo/scalar/half_float/cosine.h similarity index 100% rename from src/turbo/scalar/float16/cosine.h rename to src/turbo/scalar/half_float/cosine.h diff --git a/src/turbo/scalar/float16/inner_product.cc b/src/turbo/scalar/half_float/inner_product.cc similarity index 97% rename from src/turbo/scalar/float16/inner_product.cc rename to src/turbo/scalar/half_float/inner_product.cc index e968a6c31..93cb41ec1 100644 --- a/src/turbo/scalar/float16/inner_product.cc +++ b/src/turbo/scalar/half_float/inner_product.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "scalar/float32/inner_product.h" +#include "scalar/half_float/inner_product.h" #include namespace zvec::turbo::scalar { diff --git a/src/turbo/scalar/float16/inner_product.h b/src/turbo/scalar/half_float/inner_product.h similarity index 100% rename from src/turbo/scalar/float16/inner_product.h rename to src/turbo/scalar/half_float/inner_product.h diff --git a/src/turbo/scalar/float16/squared_euclidean.cc b/src/turbo/scalar/half_float/squared_euclidean.cc similarity index 96% rename from src/turbo/scalar/float16/squared_euclidean.cc rename to src/turbo/scalar/half_float/squared_euclidean.cc index 53d46c0a1..0967ee01a 100644 --- a/src/turbo/scalar/float16/squared_euclidean.cc +++ b/src/turbo/scalar/half_float/squared_euclidean.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "scalar/float32/squared_euclidean.h" +#include "scalar/half_float/squared_euclidean.h" #include namespace zvec::turbo::scalar { diff --git a/src/turbo/scalar/float16/squared_euclidean.h b/src/turbo/scalar/half_float/squared_euclidean.h similarity index 100% rename from src/turbo/scalar/float16/squared_euclidean.h rename to src/turbo/scalar/half_float/squared_euclidean.h diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 86893a069..97d8b1fed 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -17,6 +17,9 @@ #include "avx/float32/cosine.h" #include "avx/float32/inner_product.h" #include "avx/float32/squared_euclidean.h" +#include "avx/half_float/cosine.h" +#include "avx/half_float/inner_product.h" +#include "avx/half_float/squared_euclidean.h" #include "avx2/record_quantized_int4/cosine.h" #include "avx2/record_quantized_int4/inner_product.h" #include "avx2/record_quantized_int4/squared_euclidean.h" @@ -26,11 +29,20 @@ #include "avx512/float32/cosine.h" #include "avx512/float32/inner_product.h" #include "avx512/float32/squared_euclidean.h" +#include "avx512/half_float/cosine.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" +#include "avx512fp16/half_float/cosine.h" +#include "avx512fp16/half_float/inner_product.h" +#include "avx512fp16/half_float/squared_euclidean.h" #include "scalar/float32/cosine.h" #include "scalar/float32/inner_product.h" #include "scalar/float32/squared_euclidean.h" +#include "scalar/half_float/cosine.h" +#include "scalar/half_float/inner_product.h" +#include "scalar/half_float/squared_euclidean.h" #include "scalar/record_quantized_int4/cosine.h" #include "scalar/record_quantized_int4/inner_product.h" #include "scalar/record_quantized_int4/squared_euclidean.h" @@ -150,7 +162,7 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, 
// FP32 if (data_type == DataType::kFp32) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F && (cpu_arch_type == CpuArchType::kAuto || cpu_arch_type == CpuArchType::kAVX512)) { if (metric_type == MetricType::kSquaredEuclidean) { @@ -164,7 +176,7 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, } } - if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX && (cpu_arch_type == CpuArchType::kAuto || cpu_arch_type == CpuArchType::kAVX)) { if (metric_type == MetricType::kSquaredEuclidean) { @@ -193,42 +205,50 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, // FP16 if (data_type == DataType::kFp16) { if (quantize_type == QuantizeType::kDefault) { - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_FP16 && (cpu_arch_type == CpuArchType::kAuto || - cpu_arch_type == CpuArchType::kAVX2)) { + cpu_arch_type == CpuArchType::kAVX512FP16)) { + if (metric_type == MetricType::kInnerProduct) { + return avx512fp16::inner_product_fp16_distance; + } + } + + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F && + (cpu_arch_type == CpuArchType::kAuto || + cpu_arch_type == CpuArchType::kAVX512)) { if (metric_type == MetricType::kSquaredEuclidean) { - return avx2::squared_euclidean_int4_distance; + return avx512::squared_euclidean_fp16_distance; } if (metric_type == MetricType::kCosine) { - return avx2::cosine_int4_distance; + return avx512::cosine_fp16_distance; } if (metric_type == MetricType::kInnerProduct) { - return avx2::inner_product_int4_distance; + return avx512::inner_product_fp16_distance; } } - if (zvec::ailego::internal::CpuFeatures::static_flags_.SSE && + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX && (cpu_arch_type == 
CpuArchType::kAuto || - cpu_arch_type == CpuArchType::kSSE)) { + cpu_arch_type == CpuArchType::kAVX)) { if (metric_type == MetricType::kSquaredEuclidean) { - return sse::squared_euclidean_int4_distance; + return avx::squared_euclidean_fp16_distance; } if (metric_type == MetricType::kCosine) { - return sse::cosine_int4_distance; + return avx::cosine_fp16_distance; } if (metric_type == MetricType::kInnerProduct) { - return sse::inner_product_int4_distance; + return avx::inner_product_fp16_distance; } } if (metric_type == MetricType::kSquaredEuclidean) { - return scalar::squared_euclidean_int4_distance; + return scalar::squared_euclidean_fp16_distance; } if (metric_type == MetricType::kCosine) { - return scalar::cosine_int4_distance; + return scalar::cosine_fp16_distance; } if (metric_type == MetricType::kInnerProduct) { - return scalar::inner_product_int4_distance; + return scalar::inner_product_fp16_distance; } } } diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc index d5ef7df49..f616d9d6f 100644 --- a/tests/turbo/turbo_inner_product_test.cc +++ b/tests/turbo/turbo_inner_product_test.cc @@ -92,11 +92,11 @@ TEST(InnerProductMetric, TestFp16InnerProduct) { turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); auto func_avx = turbo::get_distance_func( - turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); auto func_scalar = turbo::get_distance_func( - turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::MetricType::kInnerProduct, turbo::DataType::kFp16, turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); ailego::NumericalVector query_vec(DIMENSION); From 2b23284edefbe98e0fdf2ec7e7fdafd767b1f468 Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 2 Apr 2026 15:54:11 +0800 Subject: [PATCH 23/44] feat: add dist funcs --- src/turbo/CMakeLists.txt | 11 + 
src/turbo/avx/half_float/inner_product.cc | 2 +- .../avx/half_float/inner_product_common.h | 2 + src/turbo/avx/half_float/squared_euclidean.cc | 2 +- ...ed_common.h => squared_euclidean_common.h} | 1 - src/turbo/avx512/half_float/common.h | 35 --- src/turbo/avx512/half_float/cosine.cc | 9 +- src/turbo/avx512/half_float/inner_product.cc | 20 +- .../avx512/half_float/inner_product_common.h | 217 ++++++++++++++++++ .../avx512/half_float/squared_euclidean.cc | 13 +- .../half_float/squared_euclidean_common.h | 208 +++++++++++++++++ .../half_float/cosine.cc | 15 +- .../half_float/cosine.h | 4 +- .../avx512_fp16/half_float/inner_product.cc | 106 +++++++++ .../half_float/inner_product.h | 4 +- .../half_float/inner_product_common.h | 61 +++++ .../half_float/squared_euclidean.cc | 111 +++++++++ .../half_float/squared_euclidean.h | 4 +- .../half_float/squared_euclidean_common.h} | 26 ++- .../avx512fp16/half_float/inner_product.cc | 45 ---- .../half_float/squared_euclidean.cc | 49 ---- src/turbo/turbo.cc | 14 +- tests/turbo/turbo_inner_product_test.cc | 12 +- 23 files changed, 809 insertions(+), 162 deletions(-) rename src/turbo/avx/half_float/{euclidean_squared_common.h => squared_euclidean_common.h} (99%) delete mode 100644 src/turbo/avx512/half_float/common.h create mode 100644 src/turbo/avx512/half_float/inner_product_common.h create mode 100644 src/turbo/avx512/half_float/squared_euclidean_common.h rename src/turbo/{avx512fp16 => avx512_fp16}/half_float/cosine.cc (74%) rename src/turbo/{avx512fp16 => avx512_fp16}/half_float/cosine.h (93%) create mode 100644 src/turbo/avx512_fp16/half_float/inner_product.cc rename src/turbo/{avx512fp16 => avx512_fp16}/half_float/inner_product.h (93%) create mode 100644 src/turbo/avx512_fp16/half_float/inner_product_common.h create mode 100644 src/turbo/avx512_fp16/half_float/squared_euclidean.cc rename src/turbo/{avx512fp16 => avx512_fp16}/half_float/squared_euclidean.h (93%) rename src/turbo/{avx512fp16/half_float/common.h => 
avx512_fp16/half_float/squared_euclidean_common.h} (55%) delete mode 100644 src/turbo/avx512fp16/half_float/inner_product.cc delete mode 100644 src/turbo/avx512fp16/half_float/squared_euclidean.cc diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 3a8ab6a2a..61442a45b 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -13,6 +13,17 @@ endif() file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) +if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") + file(GLOB_RECURSE AVX512_VNNI_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc) + set_source_files_properties( + ${AVX512_VNNI_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512FP16}" + ) + endif() +endif() + # Set per-file compile flags for AVX512-VNNI sources. # set_source_files_properties is directory-scoped, so it must be called in the # same directory that adds the sources to a target (i.e. here, not in a diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc index 4836d461d..9ef2fadd5 100644 --- a/src/turbo/avx/half_float/inner_product.cc +++ b/src/turbo/avx/half_float/inner_product.cc @@ -29,7 +29,7 @@ void inner_product_fp16_distance(const void *a, const void *b, size_t dim, const ailego::Float16 *lhs = reinterpret_cast(a); const ailego::Float16 *rhs = reinterpret_cast(b); - ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, ) + ACCUM_FP16_1X1_AVX(lhs, rhs, dim, distance, 0ull, NEGATE_FP32_GENERAL) #else (void)a; (void)b; diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h index f8f5f377d..51af98f28 100644 --- a/src/turbo/avx/half_float/inner_product_common.h +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -30,6 +30,8 @@ using namespace zvec::ailego; namespace zvec::turbo::avx { +//! Reverse sign of value (GENERAL) +#define NEGATE_FP32_GENERAL(v) -(v) //! 
Mask process of computing distance (FP16) #define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc index a3f894a95..4b7c700b2 100644 --- a/src/turbo/avx/half_float/squared_euclidean.cc +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "avx/half_float/squared_euclidean.h" -#include "avx/half_float/euclidean_squared_common.h" +#include "avx/half_float/squared_euclidean_common.h" #if defined(__AVX__) #include diff --git a/src/turbo/avx/half_float/euclidean_squared_common.h b/src/turbo/avx/half_float/squared_euclidean_common.h similarity index 99% rename from src/turbo/avx/half_float/euclidean_squared_common.h rename to src/turbo/avx/half_float/squared_euclidean_common.h index 0e667a66b..edc5252af 100644 --- a/src/turbo/avx/half_float/euclidean_squared_common.h +++ b/src/turbo/avx/half_float/squared_euclidean_common.h @@ -31,7 +31,6 @@ using namespace zvec::ailego; namespace zvec::turbo::avx { - //! Mask process of computing distance (FP16) #define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ switch (cnt) { \ diff --git a/src/turbo/avx512/half_float/common.h b/src/turbo/avx512/half_float/common.h deleted file mode 100644 index ed8171c21..000000000 --- a/src/turbo/avx512/half_float/common.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - -#pragma once - -#if defined(__AVX512F__) -#include -#include -#include - -namespace zvec::turbo::avx512::internal { - - -} // namespace zvec::turbo::avx512::internal - -#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc index e81e28f8f..84028f6dd 100644 --- a/src/turbo/avx512/half_float/cosine.cc +++ b/src/turbo/avx512/half_float/cosine.cc @@ -13,7 +13,8 @@ // limitations under the License. #include "avx512/half_float/cosine.h" -#include "avx512/half_float/common.h" +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/inner_product_common.h" #if defined(__AVX512F__) #include @@ -24,7 +25,13 @@ namespace zvec::turbo::avx512 { void cosine_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX512F__) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; #else (void)a; (void)b; diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc index 62463f8c7..74611de3a 100644 --- a/src/turbo/avx512/half_float/inner_product.cc +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "avx512/half_float/inner_product.h" -#include "avx512/half_float/common.h" +#include #if defined(__AVX512F__) #include +#include +#include "avx512/half_float/inner_product.h" +#include "avx512/half_float/inner_product_common.h" + +using namespace zvec::turbo::avx512::internal; #endif namespace zvec::turbo::avx512 { @@ -25,10 +29,14 @@ namespace zvec::turbo::avx512 { // vector pair. void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; +#if defined(__AVX512F__) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_AVX512(lhs, rhs, dim, distance, 0ull, NEGATE_FP32_GENERAL) +#endif } // Batch version of inner_product_fp16_distance. diff --git a/src/turbo/avx512/half_float/inner_product_common.h b/src/turbo/avx512/half_float/inner_product_common.h new file mode 100644 index 000000000..4f36ee1e8 --- /dev/null +++ b/src/turbo/avx512/half_float/inner_product_common.h @@ -0,0 +1,217 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512F inner product kernels for half_float (FP16) distance +// implementations (inner product, cosine, etc.).
+// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx512::internal { +//! Reverse sign of value (GENERAL) +#define NEGATE_FP32_GENERAL(v) -(v) + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX512(m, q, _RES, _LOAD, _PROC) \ + { \ + __m512i zmm_mi = _LOAD((const __m512i *)m); \ + __m512i zmm_qi = _LOAD((const __m512i *)q); \ + __m512 zmm_m = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_mi)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_qi)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + zmm_m = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_mi, 1)); \ + zmm_q = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_qi, 1)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + } + +//! 
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +//! Calculate Fused-Multiply-Add (AVX) +#define FMA_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + ymm_sum = _mm256_fmadd_ps(ymm_m, ymm_q, ymm_sum); + +#define ACCUM_FP32_STEP_AVX FMA_FP32_AVX + +//! Calculate Fused-Multiply-Add (AVX512) +#define FMA_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ps(zmm_m, zmm_q, zmm_sum); + +#define ACCUM_FP32_STEP_AVX512 FMA_FP32_AVX512 + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! 
Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX(m, q, _RES, _LOAD, _PROC) \ + { \ + __m256i ymm_mi = _LOAD((const __m256i *)m); \ + __m256i ymm_qi = _LOAD((const __m256i *)q); \ + __m256 ymm_m = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_mi)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm256_castsi256_si128(ymm_qi)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + ymm_m = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_mi, 1)); \ + ymm_q = _mm256_cvtph_ps(_mm256_extractf128_si256(ymm_qi, 1)); \ + _PROC(ymm_m, ymm_q, _RES##_0_0); \ + } + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX512(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m512, zmm_sum, _mm512_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 5) << 5); \ + if (((uintptr_t)m & 0x3f) == 0 && ((uintptr_t)q & 0x3f) == 0) { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_load_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } else { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_loadu_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } \ + __m256 ymm_sum_0_0 = _mm256_add_ps(_mm512_castps512_ps256(zmm_sum_0_0), \ + _mm256_castpd_ps(_mm512_extractf64x4_pd( \ + _mm512_castps_pd(zmm_sum_0_0), 1))); \ + if (qe >= q + 8) { \ + __m256 ymm_m = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx512::internal + +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc index 3ef21757d..8fceea89a 100644 --- a/src/turbo/avx512/half_float/squared_euclidean.cc +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx512/half_float/squared_euclidean.h" -#include "avx512/half_float/common.h" +#include #if defined(__AVX512F__) #include +#include +#include "avx512/half_float/squared_euclidean.h" +#include "avx512/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::avx512::internal; #endif namespace zvec::turbo::avx512 { @@ -24,7 +28,12 @@ namespace zvec::turbo::avx512 { void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX512F__) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + ACCUM_FP16_1X1_AVX512(lhs, rhs, dim, distance, 0ull, ) #else (void)a; (void)b; diff --git a/src/turbo/avx512/half_float/squared_euclidean_common.h b/src/turbo/avx512/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..d05842495 --- /dev/null +++ b/src/turbo/avx512/half_float/squared_euclidean_common.h @@ -0,0 +1,208 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared AVX512F squared-Euclidean kernels for half_float (FP16) distance +// implementations (l2, cosine, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX512F__) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::avx512::internal { + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_AVX512(m, q, _RES, _LOAD, _PROC) \ + { \ + __m512i zmm_mi = _LOAD((const __m512i *)m); \ + __m512i zmm_qi = _LOAD((const __m512i *)q); \ + __m512 zmm_m = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_mi)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm512_castsi512_si256(zmm_qi)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + zmm_m = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_mi, 1)); \ + zmm_q = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(zmm_qi, 1)); \ + _PROC(zmm_m, zmm_q, _RES##_0_0); \ + } + +//!
Mask process of computing distance (FP16) +#define MATRIX_FP16_MASK_AVX(lhs, rhs, cnt, _MASK, _RES, _PROC) \ + switch (cnt) { \ + case 7: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(lhs) + 6), \ + *((const short *)(lhs) + 5), *((const short *)(lhs) + 4), \ + *((const short *)(lhs) + 3), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), *((const short *)(rhs) + 6), \ + *((const short *)(rhs) + 5), *((const short *)(rhs) + 4), \ + *((const short *)(rhs) + 3), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 6: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(lhs) + 2), \ + *((const int *)(lhs) + 1), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi32((int)(_MASK), *((const int *)(rhs) + 2), \ + *((const int *)(rhs) + 1), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 5: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(lhs) + 4), *((const short *)(lhs) + 3), \ + *((const short *)(lhs) + 2), *((const short *)(lhs) + 1), \ + *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + *((const short *)(rhs) + 4), *((const short *)(rhs) + 3), \ + *((const short *)(rhs) + 2), *((const short *)(rhs) + 1), \ + *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 4: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi64((__m64)(_MASK), *((const __m64 *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + 
} \ + case 3: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(lhs) + 2), \ + *((const short *)(lhs) + 1), *((const short *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi16( \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), *((const short *)(rhs) + 2), \ + *((const short *)(rhs) + 1), *((const short *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 2: { \ + __m256 ymm_lhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(lhs)))); \ + __m256 ymm_rhs = _mm256_cvtph_ps(_mm_set_epi32( \ + (int)(_MASK), (int)(_MASK), (int)(_MASK), *((const int *)(rhs)))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + case 1: { \ + __m256 ymm_lhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(lhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + __m256 ymm_rhs = _mm256_cvtph_ps( \ + _mm_set_epi16(*((const short *)(rhs)), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK), \ + (short)(_MASK), (short)(_MASK), (short)(_MASK))); \ + _PROC(ymm_lhs, ymm_rhs, _RES##_0_0) \ + break; \ + } \ + } + +//! Calculate sum of squared difference (AVX) +#define SSD_FP32_AVX(ymm_m, ymm_q, ymm_sum) \ + { \ + __m256 ymm_d = _mm256_sub_ps(ymm_m, ymm_q); \ + ymm_sum = _mm256_fmadd_ps(ymm_d, ymm_d, ymm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX SSD_FP32_AVX + +//! 
Calculate sum of squared difference (AVX512) +#define SSD_FP32_AVX512(zmm_m, zmm_q, zmm_sum) \ + { \ + __m512 zmm_d = _mm512_sub_ps(zmm_m, zmm_q); \ + zmm_sum = _mm512_fmadd_ps(zmm_d, zmm_d, zmm_sum); \ + } + +#define ACCUM_FP32_STEP_AVX512 SSD_FP32_AVX512 + +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_AVX512(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, __m512, zmm_sum, _mm512_setzero_ps()) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 5) << 5); \ + if (((uintptr_t)m & 0x3f) == 0 && ((uintptr_t)q & 0x3f) == 0) { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_load_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } else { \ + for (; q != qe_aligned; m += 32, q += 32) { \ + MATRIX_FP16_ITER_1X1_AVX512(m, q, zmm_sum, _mm512_loadu_si512, \ + ACCUM_FP32_STEP_AVX512) \ + } \ + if (qe >= qe_aligned + 16) { \ + __m512 zmm_m = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)m)); \ + __m512 zmm_q = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)q)); \ + ACCUM_FP32_STEP_AVX512(zmm_m, zmm_q, zmm_sum_0_0) \ + m += 16; \ + q += 16; \ + } \ + } \ + __m256 ymm_sum_0_0 = _mm256_add_ps(_mm512_castps512_ps256(zmm_sum_0_0), \ + _mm256_castpd_ps(_mm512_extractf64x4_pd( \ + _mm512_castps_pd(zmm_sum_0_0), 1))); \ + if (qe >= q + 8) { \ + __m256 ymm_m = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)m)); \ + __m256 ymm_q = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q)); \ + ACCUM_FP32_STEP_AVX(ymm_m, ymm_q, ymm_sum_0_0) \ + m += 8; \ + q += 8; \ + } \ + MATRIX_FP16_MASK_AVX(m, q, (qe - q), _MASK, ymm_sum, ACCUM_FP32_STEP_AVX) \ + *out = _NORM(HorizontalAdd_FP32_V256(ymm_sum_0_0)); + +} // namespace zvec::turbo::avx512::internal + +#endif // defined(__AVX512F__) diff --git a/src/turbo/avx512fp16/half_float/cosine.cc b/src/turbo/avx512_fp16/half_float/cosine.cc similarity index 74% rename from src/turbo/avx512fp16/half_float/cosine.cc rename to src/turbo/avx512_fp16/half_float/cosine.cc index 4c65cd343..863d3ead8 100644 --- a/src/turbo/avx512fp16/half_float/cosine.cc +++ b/src/turbo/avx512_fp16/half_float/cosine.cc @@ -12,19 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "avx512fp16/half_float/cosine.h" -#include "avx512fp16/half_float/common.h" +#include "avx512_fp16/half_float/cosine.h" +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/inner_product_common.h" #if defined(__AVX512FP16__) #include #endif -namespace zvec::turbo::avx512fp16 { +namespace zvec::turbo::avx512_fp16 { void cosine_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX512FP16__) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + float ip; + inner_product_fp16_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; #else (void)a; (void)b; @@ -46,4 +53,4 @@ void cosine_fp16_batch_distance(const void *const *vectors, const void *query, #endif //__AVX__ } -} // namespace zvec::turbo::avx512fp16 \ No newline at end of file +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512fp16/half_float/cosine.h b/src/turbo/avx512_fp16/half_float/cosine.h similarity index 93% rename from src/turbo/avx512fp16/half_float/cosine.h rename to src/turbo/avx512_fp16/half_float/cosine.h index 
629bc9365..2b57bcf9e 100644 --- a/src/turbo/avx512fp16/half_float/cosine.h +++ b/src/turbo/avx512_fp16/half_float/cosine.h @@ -16,7 +16,7 @@ #include -namespace zvec::turbo::avx512fp16 { +namespace zvec::turbo::avx512_fp16 { // Compute cosine distance (negative inner product after normalization) between // a single quantized FP32 vector pair. @@ -27,4 +27,4 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx512fp16 \ No newline at end of file +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/inner_product.cc b/src/turbo/avx512_fp16/half_float/inner_product.cc new file mode 100644 index 000000000..3feccaab7 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product.cc @@ -0,0 +1,106 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512FP16__) +#include +#include +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/inner_product_common.h" + +using namespace zvec::ailego; + +using namespace zvec::turbo::avx512_fp16::internal; + +#endif + +namespace zvec::turbo::avx512_fp16 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. 
+void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512FP16__) + const Float16 *lhs = reinterpret_cast(a); + const Float16 *rhs = reinterpret_cast(b); + + const Float16 *last = lhs + dim; + const Float16 *last_aligned = lhs + ((dim >> 6) << 6); + + __m512h zmm_sum_0 = _mm512_setzero_ph(); + __m512h zmm_sum_1 = _mm512_setzero_ph(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + FMA_FP16_AVX512FP16(_mm512_load_ph(lhs + 0), _mm512_load_ph(rhs + 0), + zmm_sum_0) + + FMA_FP16_AVX512FP16(_mm512_load_ph(lhs + 32), _mm512_load_ph(rhs + 32), + zmm_sum_1) + } + + if (last >= last_aligned + 32) { + FMA_FP16_AVX512FP16(_mm512_load_ph(lhs), _mm512_load_ph(rhs), zmm_sum_0) + lhs += 32; + rhs += 32; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs + 0), _mm512_loadu_ph(rhs + 0), + zmm_sum_0) + + FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs + 32), _mm512_loadu_ph(rhs + 32), + zmm_sum_1) + } + + if (last >= last_aligned + 32) { + FMA_FP16_AVX512FP16(_mm512_loadu_ph(lhs), _mm512_loadu_ph(rhs), zmm_sum_0) + lhs += 32; + rhs += 32; + } + } + + zmm_sum_0 = _mm512_add_ph(zmm_sum_0, zmm_sum_1); + + if (lhs != last) { + __mmask32 mask = (__mmask32)((1 << (last - lhs)) - 1); + __m512i zmm_undefined = _mm512_undefined_epi32(); + zmm_sum_0 = _mm512_mask3_fmadd_ph( + _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, lhs)), + _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, rhs)), + zmm_sum_0, mask); + } + + *distance = -1 * HorizontalAdd_FP16_V512(zmm_sum_0); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif +} + +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512fp16/half_float/inner_product.h b/src/turbo/avx512_fp16/half_float/inner_product.h similarity index 93% rename from src/turbo/avx512fp16/half_float/inner_product.h rename to src/turbo/avx512_fp16/half_float/inner_product.h index dbd9d9f58..a80944713 100644 --- a/src/turbo/avx512fp16/half_float/inner_product.h +++ b/src/turbo/avx512_fp16/half_float/inner_product.h @@ -16,7 +16,7 @@ #include -namespace zvec::turbo::avx512fp16 { +namespace zvec::turbo::avx512_fp16 { // Compute inner product distance between a single quantized FP16 // vector pair. @@ -28,4 +28,4 @@ void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx512fp16 +} // namespace zvec::turbo::avx512_fp16 diff --git a/src/turbo/avx512_fp16/half_float/inner_product_common.h b/src/turbo/avx512_fp16/half_float/inner_product_common.h new file mode 100644 index 000000000..50c9e8053 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/inner_product_common.h @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance +// implementations (cosine, l2, mips_l2, etc.). +// +// All functions are marked always_inline so that when this header is included +// from a per-file-march .cc translation unit, the compiler can fully inline +// and optimize them under the correct -march flag without any cross-TU call +// overhead. + +#pragma once + +#if defined(__AVX512FP16__) +#include +#include +#include + +namespace zvec::turbo::avx512_fp16::internal { + +//! Calculate Fused-Multiply-Add (AVX512FP16) +#define FMA_FP16_AVX512FP16(zmm_m, zmm_q, zmm_sum) \ + zmm_sum = _mm512_fmadd_ph(zmm_m, zmm_q, zmm_sum); + +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} + +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +static inline float HorizontalAdd_FP16_V512(__m512h v) { + __m512 low = _mm512_cvtxph_ps(_mm512_castph512_ph256(v)); + __m512 high = _mm512_cvtxph_ps( + _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(v), 1))); + + return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); +} + +} // namespace zvec::turbo::avx512_fp16::internal + +#endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc new file mode 100644 index 000000000..3956fd090 --- /dev/null +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc @@ -0,0 +1,111 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__AVX512F__) +#include +#include +#include "avx512_fp16/half_float/squared_euclidean.h" +#include "avx512_fp16/half_float/squared_euclidean_common.h" + +using namespace zvec::ailego; + +using namespace zvec::turbo::avx512_fp16::internal; + +#endif + +namespace zvec::turbo::avx512_fp16 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__AVX512FP16__) + const Float16 *lhs = reinterpret_cast(a); + const Float16 *rhs = reinterpret_cast(b); + + const Float16 *last = lhs + dim; + const Float16 *last_aligned = lhs + ((dim >> 6) << 6); + + __m512h zmm_sum_0 = _mm512_setzero_ph(); + __m512h zmm_sum_1 = _mm512_setzero_ph(); + + if (((uintptr_t)lhs & 0x3f) == 0 && ((uintptr_t)rhs & 0x3f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m512h zmm_d_0 = + _mm512_sub_ph(_mm512_load_ph(lhs + 0), _mm512_load_ph(rhs + 0)); + __m512h zmm_d_1 = + _mm512_sub_ph(_mm512_load_ph(lhs + 32), _mm512_load_ph(rhs + 32)); + zmm_sum_0 = _mm512_fmadd_ph(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ph(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 32) { + __m512h zmm_d = _mm512_sub_ph(_mm512_load_ph(lhs), _mm512_load_ph(rhs)); + zmm_sum_0 = _mm512_fmadd_ph(zmm_d, zmm_d, zmm_sum_0); + lhs += 32; + rhs += 32; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m512h zmm_d_0 = + _mm512_sub_ph(_mm512_loadu_ph(lhs + 0), _mm512_loadu_ph(rhs + 0)); + __m512h zmm_d_1 = + _mm512_sub_ph(_mm512_loadu_ph(lhs + 32), 
_mm512_loadu_ph(rhs + 32)); + zmm_sum_0 = _mm512_fmadd_ph(zmm_d_0, zmm_d_0, zmm_sum_0); + zmm_sum_1 = _mm512_fmadd_ph(zmm_d_1, zmm_d_1, zmm_sum_1); + } + + if (last >= last_aligned + 32) { + __m512h zmm_d = _mm512_sub_ph(_mm512_loadu_ph(lhs), _mm512_loadu_ph(rhs)); + zmm_sum_0 = _mm512_fmadd_ph(zmm_d, zmm_d, zmm_sum_0); + lhs += 32; + rhs += 32; + } + } + + zmm_sum_0 = _mm512_add_ph(zmm_sum_0, zmm_sum_1); + if (lhs != last) { + __mmask32 mask = (__mmask32)((1 << (last - lhs)) - 1); + __m512i zmm_undefined = _mm512_undefined_epi32(); + __m512h zmm_undefined_ph = _mm512_undefined_ph(); + __m512h zmm_d = _mm512_mask_sub_ph( + zmm_undefined_ph, mask, + _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, lhs)), + _mm512_castsi512_ph(_mm512_mask_loadu_epi16(zmm_undefined, mask, rhs))); + zmm_sum_0 = _mm512_mask3_fmadd_ph(zmm_d, zmm_d, zmm_sum_0, mask); + } + + *distance = HorizontalAdd_FP16_V512(zmm_sum_0); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __AVX512F__ +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__AVX512FP16__) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__AVX512F__ +} + +} // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512fp16/half_float/squared_euclidean.h b/src/turbo/avx512_fp16/half_float/squared_euclidean.h similarity index 93% rename from src/turbo/avx512fp16/half_float/squared_euclidean.h rename to src/turbo/avx512_fp16/half_float/squared_euclidean.h index f3a13d3d2..b78d5ab8d 100644 --- a/src/turbo/avx512fp16/half_float/squared_euclidean.h +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.h @@ -16,7 +16,7 @@ #include -namespace zvec::turbo::avx512fp16 { +namespace zvec::turbo::avx512_fp16 { // Compute squared euclidean distance between a single quantized FP32 // vector pair. 
@@ -28,4 +28,4 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); -} // namespace zvec::turbo::avx512fp16 +} // namespace zvec::turbo::avx512_fp16 diff --git a/src/turbo/avx512fp16/half_float/common.h b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h similarity index 55% rename from src/turbo/avx512fp16/half_float/common.h rename to src/turbo/avx512_fp16/half_float/squared_euclidean_common.h index da0574085..c769b067f 100644 --- a/src/turbo/avx512fp16/half_float/common.h +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h @@ -27,9 +27,31 @@ #include #include -namespace zvec::turbo::avx512fp16::internal { +namespace zvec::turbo::avx512_fp16::internal { +static inline float HorizontalAdd_FP32_V256(__m256 v) { + __m256 x1 = _mm256_hadd_ps(v, v); + __m256 x2 = _mm256_hadd_ps(x1, x1); + __m128 x3 = _mm256_extractf128_ps(x2, 1); + __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); + return _mm_cvtss_f32(x4); +} -} // namespace zvec::turbo::avx512fp16::internal +static inline float HorizontalAdd_FP32_V512(__m512 v) { + __m256 low = _mm512_castps512_ps256(v); + __m256 high = + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); + return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); +} + +static inline float HorizontalAdd_FP16_V512(__m512h v) { + __m512 low = _mm512_cvtxph_ps(_mm512_castph512_ph256(v)); + __m512 high = _mm512_cvtxph_ps( + _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(v), 1))); + + return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); +} + +} // namespace zvec::turbo::avx512_fp16::internal #endif // defined(__AVX512FP16__) diff --git a/src/turbo/avx512fp16/half_float/inner_product.cc b/src/turbo/avx512fp16/half_float/inner_product.cc deleted file mode 100644 index 1b2870c54..000000000 --- a/src/turbo/avx512fp16/half_float/inner_product.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025-present the zvec project 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "avx512fp16/half_float/inner_product.h" -#include "avx512fp16/half_float/common.h" - -#if defined(__AVX512FP16__) -#include -#endif - -namespace zvec::turbo::avx512fp16 { - -// Compute squared Euclidean distance between a single quantized FP16 -// vector pair. -void inner_product_fp16_distance(const void *a, const void *b, size_t dim, - float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; -} - -// Batch version of inner_product_fp16_distance. -void inner_product_fp16_batch_distance(const void *const *vectors, - const void *query, size_t n, size_t dim, - float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; -} - -} // namespace zvec::turbo::avx512fp16 \ No newline at end of file diff --git a/src/turbo/avx512fp16/half_float/squared_euclidean.cc b/src/turbo/avx512fp16/half_float/squared_euclidean.cc deleted file mode 100644 index cefd49b97..000000000 --- a/src/turbo/avx512fp16/half_float/squared_euclidean.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "avx512fp16/half_float/squared_euclidean.h" -#include "avx512fp16/half_float/common.h" - -#if defined(__AVX512F__) -#include -#endif - -namespace zvec::turbo::avx512fp16 { - -void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, - float *distance) { -#if defined(__AVX512FP16__) - -#else - (void)a; - (void)b; - (void)dim; - (void)distance; -#endif // __AVX512F__ -} - -void squared_euclidean_fp32_batch_distance(const void *const *vectors, - const void *query, size_t n, - size_t dim, float *distances) { -#if defined(__AVX512FP16__) -#else - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; -#endif //__AVX512F__ -} - -} // namespace zvec::turbo::avx512fp16 \ No newline at end of file diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 97d8b1fed..0fe3fe024 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -32,11 +32,11 @@ #include "avx512/half_float/cosine.h" #include "avx512/half_float/inner_product.h" #include "avx512/half_float/squared_euclidean.h" +#include "avx512_fp16/half_float/cosine.h" +#include "avx512_fp16/half_float/inner_product.h" +#include "avx512_fp16/half_float/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" -#include "avx512fp16/half_float/cosine.h" -#include "avx512fp16/half_float/inner_product.h" -#include "avx512fp16/half_float/squared_euclidean.h" #include "scalar/float32/cosine.h" #include "scalar/float32/inner_product.h" #include 
"scalar/float32/squared_euclidean.h" @@ -209,7 +209,13 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, (cpu_arch_type == CpuArchType::kAuto || cpu_arch_type == CpuArchType::kAVX512FP16)) { if (metric_type == MetricType::kInnerProduct) { - return avx512fp16::inner_product_fp16_distance; + return avx512_fp16::inner_product_fp16_distance; + } + if (metric_type == MetricType::kCosine) { + return avx512_fp16::cosine_fp16_distance; + } + if (metric_type == MetricType::kInnerProduct) { + return avx512_fp16::inner_product_fp16_distance; } } diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc index f616d9d6f..9b90675fe 100644 --- a/tests/turbo/turbo_inner_product_test.cc +++ b/tests/turbo/turbo_inner_product_test.cc @@ -62,8 +62,9 @@ TEST(InnerProductMetric, TestFp32InnerProduct) { func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); - ASSERT_NEAR(score_scalar, score_avx512, 0.001); - ASSERT_NEAR(score_scalar, score_avx, 0.001); + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); } } @@ -141,8 +142,9 @@ TEST(InnerProductMetric, TestFp16InnerProduct) { func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); - ASSERT_NEAR(score_scalar, score_avx512fp16, 0.001); - ASSERT_NEAR(score_scalar, score_avx512, 0.001); - ASSERT_NEAR(score_scalar, score_avx, 0.001); + float epsilon = 0.01; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); } } From 950c7fd143eddf5a78d00c8987013b8016c011f8 Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 2 Apr 2026 18:28:19 +0800 Subject: [PATCH 24/44] feat: add cosine and euclidean dist func --- src/turbo/avx/half_float/cosine.cc | 2 +- tests/turbo/turbo_cosine_test.cc | 155 +++++++++++++++++++++++++++- tests/turbo/turbo_euclidean_test.cc | 131 
++++++++++++++++++++++- 3 files changed, 281 insertions(+), 7 deletions(-) diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc index 40ac05853..3500907ac 100644 --- a/src/turbo/avx/half_float/cosine.cc +++ b/src/turbo/avx/half_float/cosine.cc @@ -29,7 +29,7 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, size_t d = dim - extra_dim; float ip; - cosine_fp16_distance(a, b, d, &ip); + inner_product_fp16_distance(a, b, d, &ip); *distance = 1 - ip; #else diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc index 83debae27..77622afa6 100644 --- a/tests/turbo/turbo_cosine_test.cc +++ b/tests/turbo/turbo_cosine_test.cc @@ -11,16 +11,163 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include -#include #include -#include +#include +#include #include "zvec/core/framework/index_factory.h" using namespace zvec; using namespace zvec::core; using namespace zvec::ailego; -TEST(CosineMetric, TestFp32Cosine) {} +// Target Test Type: avx, avx512, scalar +TEST(CosineMetric, TestFp32Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kCosine, 
turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(CosineMetric, TestFp16Cosine) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = 
converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), 
qmeta_reformer.dimension(), + &score_scalar); -TEST(CosineMetric, TestFp16Cosine) {} + float epsilon = 0.01; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc index 016cdc585..7a154ecc6 100644 --- a/tests/turbo/turbo_euclidean_test.cc +++ b/tests/turbo/turbo_euclidean_test.cc @@ -13,11 +13,138 @@ // limitations under the License. #include #include +#include +#include #include "zvec/core/framework/index_factory.h" using namespace zvec; using namespace zvec::core; +using namespace zvec::ailego; -TEST(SquaredEuclideanMetric, TestFp32SquaredEuclidean) {} +// Target Test Type: avx, avx512, scalar +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); -TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) {} + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + float score_scalar{0.0f}; + float score_avx{0.0f}; + float score_avx512{0.0f}; + + 
func_scalar(doc_vec.data(), query_vec.data(), DIMENSION, &score_scalar); + + func_avx512(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx512); + + func_avx(doc_vec.data(), query_vec.data(), DIMENSION, &score_avx); + + float epsilon = 0.001; + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1000; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto func_avx512fp16 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto func_avx512 = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto func_avx = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto func_scalar = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = 
dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + float score_avx512fp16{0.0f}; + float score_avx512{0.0f}; + float score_avx{0.0f}; + float score_scalar{0.0f}; + + func_avx512fp16(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512fp16); + + func_avx512(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx512); + + func_avx(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_avx); + + func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), + &score_scalar); + + float epsilon = 0.01; + ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); + ASSERT_NEAR(score_scalar, score_avx512, epsilon); + ASSERT_NEAR(score_scalar, score_avx, epsilon); + } +} From 000a1991507a49b11ce3e95a6a3ae266df04dbd4 Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 9 Apr 2026 16:40:06 +0800 Subject: [PATCH 25/44] refactor: change makefile --- src/turbo/CMakeLists.txt | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 4a0443a31..767e81daa 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -14,44 +14,32 @@ endif() file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) if(NOT ANDROID AND AUTO_DETECT_ARCH) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") - file(GLOB_RECURSE AVX512_VNNI_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc) + if (HOST_ARCH MATCHES "^(x86|x64)$") + file(GLOB_RECURSE AVX512_AVX512FP16_SRCS 
${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc) set_source_files_properties( - ${AVX512_VNNI_SRCS} + ${AVX512_AVX512FP16_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512FP16}" ) - endif() -endif() -# Set per-file compile flags for AVX512-VNNI sources. -# set_source_files_properties is directory-scoped, so it must be called in the -# same directory that adds the sources to a target (i.e. here, not in a -# subdirectory). -if(NOT ANDROID AND AUTO_DETECT_ARCH) - if (HOST_ARCH MATCHES "^(x86|x64)$") + # Set per-file compile flags for AVX512-VNNI sources. + # set_source_files_properties is directory-scoped, so it must be called in the + # same directory that adds the sources to a target (i.e. here, not in a + # subdirectory). file(GLOB_RECURSE AVX512_VNNI_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc) set_source_files_properties( ${AVX512_VNNI_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) - endif() -endif() -if(NOT ANDROID AND AUTO_DETECT_ARCH) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") file(GLOB_RECURSE AVX512_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.cc) set_source_files_properties( ${AVX512_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) - endif() -endif() - -if(NOT ANDROID AND AUTO_DETECT_ARCH) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") + file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc) file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.cc) set_source_files_properties( @@ -59,12 +47,7 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX2}" ) - endif() -endif() - -if(NOT ANDROID AND AUTO_DETECT_ARCH) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") file(GLOB_RECURSE SSE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc) set_source_files_properties( ${SSE_SRCS} From 27ec0f0fb9c8692f6b1cb4c121a6d6b9b69e1eeb Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 9 Apr 2026 17:19:12 +0800 Subject: [PATCH 26/44] refactor: change makefile --- 
src/turbo/CMakeLists.txt | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 767e81daa..eae831309 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -15,7 +15,9 @@ file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) if(NOT ANDROID AND AUTO_DETECT_ARCH) if (HOST_ARCH MATCHES "^(x86|x64)$") - file(GLOB_RECURSE AVX512_AVX512FP16_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc) + file(GLOB_RECURSE AVX512_AVX512FP16_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_fp16/*.c) set_source_files_properties( ${AVX512_AVX512FP16_SRCS} PROPERTIES @@ -26,29 +28,38 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) # set_source_files_properties is directory-scoped, so it must be called in the # same directory that adds the sources to a target (i.e. here, not in a # subdirectory). - file(GLOB_RECURSE AVX512_VNNI_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc) + file(GLOB_RECURSE AVX512_VNNI_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512_vnni/*.c) set_source_files_properties( ${AVX512_VNNI_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) - file(GLOB_RECURSE AVX512_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.cc) + file(GLOB_RECURSE AVX512_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx512/*.c) set_source_files_properties( ${AVX512_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) - file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc) - file(GLOB_RECURSE AVX2_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.cc) + file(GLOB_RECURSE AVX2_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx2/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/avx/*.c) set_source_files_properties( ${AVX2_SRCS} PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX2}" ) - file(GLOB_RECURSE SSE_SRCS 
${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc) + file(GLOB_RECURSE SSE_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/sse/*.c) set_source_files_properties( ${SSE_SRCS} PROPERTIES From 08d995e6fd217771bacf2c9f028585d77df5094a Mon Sep 17 00:00:00 2001 From: ray Date: Fri, 10 Apr 2026 16:19:02 +0800 Subject: [PATCH 27/44] fix: fix single dist --- .../avx2/record_quantized_int4/cosine.cc | 46 +++++------- .../avx2/record_quantized_int8/cosine.cc | 21 ++++++ .../scalar/record_quantized_int4/common.h | 2 +- .../scalar/record_quantized_int4/cosine.cc | 29 ++++++-- .../scalar/record_quantized_int8/cosine.cc | 11 ++- .../squared_euclidean.cc | 1 + src/turbo/sse/record_quantized_int4/cosine.cc | 32 +++++++-- src/turbo/sse/record_quantized_int8/cosine.cc | 21 ++++++ tests/turbo/turbo_quantized_integer_test.cc | 71 ++++++++++++++++--- 9 files changed, 180 insertions(+), 54 deletions(-) diff --git a/src/turbo/avx2/record_quantized_int4/cosine.cc b/src/turbo/avx2/record_quantized_int4/cosine.cc index f83c7358c..21e05b2c0 100644 --- a/src/turbo/avx2/record_quantized_int4/cosine.cc +++ b/src/turbo/avx2/record_quantized_int4/cosine.cc @@ -23,7 +23,8 @@ namespace zvec::turbo::avx2 { void cosine_int4_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX2__) - const int original_dim = dim - 24; + const int d = dim - 40; + const size_t original_dim = d >> 1; if (original_dim <= 0) { return; } @@ -31,23 +32,20 @@ void cosine_int4_distance(const void *a, const void *b, size_t dim, internal::inner_product_int4_avx2(a, b, original_dim, distance); const float *a_tail = reinterpret_cast( - reinterpret_cast(a) + original_dim); + reinterpret_cast(a) + original_dim); const float *b_tail = reinterpret_cast( - reinterpret_cast(b) + original_dim); + reinterpret_cast(b) + original_dim); - float ma = a_tail[0]; - float mb = a_tail[1]; - float ms = a_tail[2]; + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; - float qa = 
b_tail[0]; - float qb = b_tail[1]; - float qs = b_tail[2]; + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; - // Dequantize and compute cosine distance: - // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms - // + original_dim * qb * mb) *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + - static_cast(original_dim) * qb * mb); + static_cast(d) * qb * mb); #else (void)a; (void)b; @@ -59,8 +57,8 @@ void cosine_int4_distance(const void *a, const void *b, size_t dim, void cosine_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX2__) - // `dim` is the full encoded size; the original vector occupies dim-24 bytes. - const int original_dim = dim - 24; + const int d = dim - 40; + const size_t original_dim = d >> 1; if (original_dim <= 0) { return; } @@ -69,31 +67,21 @@ void cosine_int4_batch_distance(const void *const *vectors, const void *query, distances); const float *q_tail = reinterpret_cast( - reinterpret_cast(query) + original_dim); + reinterpret_cast(query) + original_dim); float qa = q_tail[0]; float qb = q_tail[1]; float qs = q_tail[2]; for (int i = 0; i < n; ++i) { const float *m_tail = reinterpret_cast( - reinterpret_cast(vectors[i]) + original_dim); + reinterpret_cast(vectors[i]) + original_dim); float ma = m_tail[0]; float mb = m_tail[1]; float ms = m_tail[2]; - // Correct for the +128 shift applied to the query during preprocessing: - // dpbusd computes sum(uint8_query[i] * int8_data[i]) - // = sum((int8_query[i] + 128) * int8_data[i]) - // = true_ip + 128 * sum(int8_data[i]) - // int8_sum is stored as the 5th int-sized field after the 4 floats. 
- int int8_sum = reinterpret_cast(m_tail)[4]; - float &result = distances[i]; - result -= 128.0f * static_cast(int8_sum); - // Dequantize and compute cosine distance: - // cosine_dist = -(ma * qa * ip + mb * qa * qs + qb * ma * ms - // + original_dim * qb * mb) + float &result = distances[i]; result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + - static_cast(original_dim) * qb * mb); + static_cast(d) * qb * mb); } #else (void)vectors; diff --git a/src/turbo/avx2/record_quantized_int8/cosine.cc b/src/turbo/avx2/record_quantized_int8/cosine.cc index 5486a52a6..b31df0a13 100644 --- a/src/turbo/avx2/record_quantized_int8/cosine.cc +++ b/src/turbo/avx2/record_quantized_int8/cosine.cc @@ -23,7 +23,28 @@ namespace zvec::turbo::avx2 { void cosine_int8_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__AVX2__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_avx2(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); #else (void)a; (void)b; diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h index 32ea1408e..1e81dccd5 100644 --- a/src/turbo/scalar/record_quantized_int4/common.h +++ b/src/turbo/scalar/record_quantized_int4/common.h @@ -61,7 +61,7 @@ static __attribute__((always_inline)) void inner_product_int4_scalar( Int4MulTable[((m_val >> 0) & 0xf0) | ((q_val >> 4) & 0xf)]; } - *distance = -sum; + *distance = sum; } } // namespace zvec::turbo::scalar::internal \ No newline at end of file diff --git 
a/src/turbo/scalar/record_quantized_int4/cosine.cc b/src/turbo/scalar/record_quantized_int4/cosine.cc index ad6105d31..ff4e7d9c4 100644 --- a/src/turbo/scalar/record_quantized_int4/cosine.cc +++ b/src/turbo/scalar/record_quantized_int4/cosine.cc @@ -19,10 +19,31 @@ namespace zvec::turbo::scalar { void cosine_int4_distance(const void *a, const void *b, size_t dim, float *distance) { - (void)a; - (void)b; - (void)dim; - (void)distance; + const int d = dim - 40; + const size_t original_dim = d >> 1; + + if (original_dim <= 0) { + return; + } + + internal::inner_product_int4_scalar(a, b, original_dim, distance); + *distance = -*distance; + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(d) * qb * mb); } void cosine_int4_batch_distance(const void *const *vectors, const void *query, diff --git a/src/turbo/scalar/record_quantized_int8/cosine.cc b/src/turbo/scalar/record_quantized_int8/cosine.cc index e6a7fe170..a18403f3e 100644 --- a/src/turbo/scalar/record_quantized_int8/cosine.cc +++ b/src/turbo/scalar/record_quantized_int8/cosine.cc @@ -15,25 +15,24 @@ #include "scalar/record_quantized_int8/cosine.h" #include #include "scalar/record_quantized_int8/common.h" -#include "scalar/record_quantized_int8/inner_product.h" namespace zvec::turbo::scalar { void cosine_int8_distance(const void *a, const void *b, size_t dim, float *distance) { - const size_t original_dim = dim - 20; + const int original_dim = dim - 24; if (original_dim <= 0) { return; } - zvec::turbo::scalar::inner_product_int8_distance(a, b, original_dim, - distance); + internal::inner_product_int8_scalar(a, b, original_dim, distance); + *distance = -*distance; const float 
*a_tail = reinterpret_cast( - reinterpret_cast(a) + original_dim); + reinterpret_cast(a) + original_dim); const float *b_tail = reinterpret_cast( - reinterpret_cast(b) + original_dim); + reinterpret_cast(b) + original_dim); float qa = a_tail[0]; float qb = a_tail[1]; diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc index 82d5180c9..4da173c33 100644 --- a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc @@ -25,6 +25,7 @@ void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, } internal::inner_product_int8_scalar(a, b, original_dim, distance); + *distance = -*distance; const float *a_tail = reinterpret_cast( reinterpret_cast(a) + original_dim); diff --git a/src/turbo/sse/record_quantized_int4/cosine.cc b/src/turbo/sse/record_quantized_int4/cosine.cc index 2a87508f5..5751e511d 100644 --- a/src/turbo/sse/record_quantized_int4/cosine.cc +++ b/src/turbo/sse/record_quantized_int4/cosine.cc @@ -14,7 +14,7 @@ #include "sse/record_quantized_int4/cosine.h" #include "sse/record_quantized_int4/common.h" -#if defined(__SSE__) +#if defined(__SSE4_1__) #include #endif @@ -22,19 +22,41 @@ namespace zvec::turbo::sse { void cosine_int4_distance(const void *a, const void *b, size_t dim, float *distance) { -#if defined(__SSE__) +#if defined(__SSE4_1__) + const int d = dim - 40; + const size_t original_dim = d >> 1; + if (original_dim <= 0) { + return; + } + internal::inner_product_int4_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + 
static_cast(d) * qb * mb); #else (void)a; (void)b; (void)dim; (void)distance; -#endif // __SSE__ +#endif // __SSE4_1__ } void cosine_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { -#if defined(__SSE__) +#if defined(__SSE4_1__) #else (void)vectors; @@ -42,7 +64,7 @@ void cosine_int4_batch_distance(const void *const *vectors, const void *query, (void)n; (void)dim; (void)distances; -#endif //__SSE__ +#endif //__SSE4_1__ } } // namespace zvec::turbo::sse \ No newline at end of file diff --git a/src/turbo/sse/record_quantized_int8/cosine.cc b/src/turbo/sse/record_quantized_int8/cosine.cc index dabff9f71..879cf9c99 100644 --- a/src/turbo/sse/record_quantized_int8/cosine.cc +++ b/src/turbo/sse/record_quantized_int8/cosine.cc @@ -24,7 +24,28 @@ namespace zvec::turbo::sse { void cosine_int8_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__SSE__) + const int original_dim = dim - 24; + if (original_dim <= 0) { + return; + } + internal::inner_product_int8_sse(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + static_cast(original_dim) * qb * mb); #else (void)a; (void)b; diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc index 2419eb7cb..0202acd1b 100644 --- a/tests/turbo/turbo_quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -41,11 +41,16 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); auto reformer = 
IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); auto func_float32 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + auto func_avx512vnni = turbo::get_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + auto func_avx2 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); @@ -85,6 +90,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { float score_float32{0.0f}; float score_scalar{0.0f}; + float score_avx512vnni{0.0f}; float score_avx2{0.0f}; float score_sse{0.0f}; @@ -93,12 +99,16 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); + func_avx512vnni(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512vnni); + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_avx2); func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_sse); + ASSERT_NEAR(score_float32, score_avx512vnni, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); @@ -122,6 +132,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); auto func_float32 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, @@ -198,10 +209,12 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { auto 
converter = IndexFactory::CreateConverter("Int8StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); auto func_float32 = turbo::get_distance_func( turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, @@ -278,10 +291,12 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); auto func_float32 = turbo::get_distance_func( turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, @@ -367,6 +382,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { auto &fp32_convert_meta = fp32_converter->meta(); auto fp32_reformer = IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); // int8 converter auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); @@ -375,11 +391,16 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); auto func_float32 = turbo::get_distance_func( turbo::MetricType::kCosine, turbo::DataType::kFp32, turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + auto func_avx512vnni = turbo::get_distance_func( + turbo::MetricType::kCosine, 
turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + auto func_avx2 = turbo::get_distance_func( turbo::MetricType::kCosine, turbo::DataType::kInt8, turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); @@ -409,6 +430,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { float score_float32{0.0f}; float score_scalar{0.0f}; + float score_avx512vnni{0.0f}; float score_avx2{0.0f}; float score_sse{0.0f}; @@ -441,12 +463,16 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); + func_avx512vnni(doc_out.data(), query_out.data(), + qmeta_reformer.dimension(), &score_avx512vnni); + func_avx2(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_avx2); func_sse(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_sse); + ASSERT_NEAR(score_float32, score_avx512vnni, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); @@ -463,13 +489,26 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; const size_t COUNT = 1000; - auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); - meta.set_metric("InnerProduct", 0, Params()); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int4 converter + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); 
ASSERT_TRUE(!!converter); ASSERT_EQ(0u, converter->init(meta, Params())); auto &convert_meta = converter->meta(); auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); auto func_float32 = turbo::get_distance_func( turbo::MetricType::kCosine, turbo::DataType::kFp32, @@ -500,6 +539,27 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { IndexQueryMeta qmeta; qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + float score_float32{0.0f}; + float score_scalar{0.0f}; + float score_avx2{0.0f}; + float score_sse{0.0f}; + + std::string fp32_query_out; + ASSERT_EQ(0, + fp32_reformer->transform(query_vec.data(), qmeta, &fp32_query_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + func_float32(fp32_query_out.data(), fp32_doc_out.data(), + fp32_qmeta_reformer.dimension(), &score_float32); + IndexQueryMeta qmeta_reformer; std::string query_out; @@ -512,13 +572,6 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { &qmeta_reformer)); ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - float score_float32{0.0f}; - float score_scalar{0.0f}; - float score_avx2{0.0f}; - float score_sse{0.0f}; - - func_float32(query_vec.data(), doc_vec.data(), DIMENSION, &score_float32); - func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); From b4f4bdcb4f87415460b890bcc38a4438b4d03fed Mon Sep 17 00:00:00 2001 From: ray Date: Fri, 10 Apr 2026 16:48:49 +0800 Subject: [PATCH 28/44] fix: fix single dist --- .../scalar/record_quantized_int4/common.h | 2 +- .../scalar/record_quantized_int4/cosine.cc | 1 - tests/turbo/turbo_quantized_integer_test.cc | 18 +++++++++--------- 
3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h index 1e81dccd5..4257a66ed 100644 --- a/src/turbo/scalar/record_quantized_int4/common.h +++ b/src/turbo/scalar/record_quantized_int4/common.h @@ -54,7 +54,7 @@ static __attribute__((always_inline)) void inner_product_int4_scalar( const uint8_t *q = reinterpret_cast(b); float sum = 0.0; - for (size_t i = 0; i < (dim >> 1); ++i) { + for (size_t i = 0; i < dim; ++i) { uint8_t m_val = m[i]; uint8_t q_val = q[i]; sum += Int4MulTable[((m_val << 4) & 0xf0) | ((q_val >> 0) & 0xf)] + diff --git a/src/turbo/scalar/record_quantized_int4/cosine.cc b/src/turbo/scalar/record_quantized_int4/cosine.cc index ff4e7d9c4..b4c516fde 100644 --- a/src/turbo/scalar/record_quantized_int4/cosine.cc +++ b/src/turbo/scalar/record_quantized_int4/cosine.cc @@ -27,7 +27,6 @@ void cosine_int4_distance(const void *a, const void *b, size_t dim, } internal::inner_product_int4_scalar(a, b, original_dim, distance); - *distance = -*distance; const float *a_tail = reinterpret_cast( reinterpret_cast(a) + original_dim); diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc index 0202acd1b..252b2e278 100644 --- a/tests/turbo/turbo_quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -193,9 +193,9 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); - // ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); - // ASSERT_NEAR(score_scalar, score_avx2, 0.001); - // ASSERT_NEAR(score_scalar, score_sse, 0.001); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); } } @@ -357,9 +357,9 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { 
ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); - // ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); - // ASSERT_NEAR(score_scalar, score_avx2, 0.001); - // ASSERT_NEAR(score_scalar, score_sse, 0.001); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); } } @@ -583,8 +583,8 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { ASSERT_NEAR(score_float32, score_avx2, 0.2 * DIMENSION); ASSERT_NEAR(score_float32, score_sse, 0.2 * DIMENSION); - // ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); - // ASSERT_NEAR(score_scalar, score_avx2, 0.001); - // ASSERT_NEAR(score_scalar, score_sse, 0.001); + ASSERT_NEAR(score_float32, score_scalar, 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar, score_avx2, 0.001); + ASSERT_NEAR(score_scalar, score_sse, 0.001); } } From 97455f6ecd698aa628dc019d2b4376d65a286e94 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 12:35:58 +0800 Subject: [PATCH 29/44] fix: avx512fp16 dist func --- .../half_float/squared_euclidean.cc | 2 +- .../half_float/squared_euclidean.h | 4 +- src/turbo/turbo.cc | 55 ++++++++++++++++++- tests/turbo/turbo_cosine_test.cc | 2 +- tests/turbo/turbo_euclidean_test.cc | 2 +- tests/turbo/turbo_inner_product_test.cc | 2 +- 6 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc index 3956fd090..d3fb56587 100644 --- a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc @@ -28,7 +28,7 @@ using namespace zvec::turbo::avx512_fp16::internal; namespace zvec::turbo::avx512_fp16 { -void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if 
defined(__AVX512FP16__) const Float16 *lhs = reinterpret_cast(a); diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.h b/src/turbo/avx512_fp16/half_float/squared_euclidean.h index b78d5ab8d..669749f51 100644 --- a/src/turbo/avx512_fp16/half_float/squared_euclidean.h +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.h @@ -20,11 +20,11 @@ namespace zvec::turbo::avx512_fp16 { // Compute squared euclidean distance between a single quantized FP32 // vector pair. -void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance); // Batch version of squared euclidean FP32. -void squared_euclidean_fp32_batch_distance(const void *const *vectors, +void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 0fe3fe024..d06b96b1e 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -61,6 +61,55 @@ namespace zvec::turbo { DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, QuantizeType quantize_type, CpuArchType cpu_arch_type) { +#if defined(__ARM_NEON) + // INT8 + if (data_type == DataType::kInt8) { + if (metric_type == MetricType::kSquaredEuclidean) { + } + + if (metric_type == MetricType::kCosine) { + } + + if (metric_type == MetricType::kInnerProduct) { + } + } + + // INT$ + if (data_type == DataType::kInt4) { + if (metric_type == MetricType::kSquaredEuclidean) { + } + + if (metric_type == MetricType::kCosine) { + } + + if (metric_type == MetricType::kInnerProduct) { + } + } + + // FP32 + if (data_type == DataType::kFp32) { + if (metric_type == MetricType::kSquaredEuclidean) { + } + + if (metric_type == MetricType::kCosine) { + } + + if (metric_type == MetricType::kInnerProduct) { + } + } + + // FP16 + if (data_type == DataType::kFp16) { + if (metric_type == MetricType::kSquaredEuclidean) 
{ + } + + if (metric_type == MetricType::kCosine) { + } + + if (metric_type == MetricType::kInnerProduct) { + } + } +#else // INT8 if (data_type == DataType::kInt8) { if (quantize_type == QuantizeType::kDefault) { @@ -214,8 +263,8 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, if (metric_type == MetricType::kCosine) { return avx512_fp16::cosine_fp16_distance; } - if (metric_type == MetricType::kInnerProduct) { - return avx512_fp16::inner_product_fp16_distance; + if (metric_type == MetricType::kSquaredEuclidean) { + return avx512_fp16::squared_euclidean_fp16_distance; } } @@ -258,6 +307,8 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, } } } +#endif + return nullptr; } diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc index 77622afa6..f77b5e774 100644 --- a/tests/turbo/turbo_cosine_test.cc +++ b/tests/turbo/turbo_cosine_test.cc @@ -165,7 +165,7 @@ TEST(CosineMetric, TestFp16Cosine) { func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); - float epsilon = 0.01; + float epsilon = 0.2; ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); ASSERT_NEAR(score_scalar, score_avx512, epsilon); ASSERT_NEAR(score_scalar, score_avx, epsilon); diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc index 7a154ecc6..51f9bad49 100644 --- a/tests/turbo/turbo_euclidean_test.cc +++ b/tests/turbo/turbo_euclidean_test.cc @@ -142,7 +142,7 @@ TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) { func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); - float epsilon = 0.01; + float epsilon = 0.2; ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); ASSERT_NEAR(score_scalar, score_avx512, epsilon); ASSERT_NEAR(score_scalar, score_avx, epsilon); diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc index 9b90675fe..ff0fa8144 100644 --- 
a/tests/turbo/turbo_inner_product_test.cc +++ b/tests/turbo/turbo_inner_product_test.cc @@ -142,7 +142,7 @@ TEST(InnerProductMetric, TestFp16InnerProduct) { func_scalar(doc_out.data(), query_out.data(), qmeta_reformer.dimension(), &score_scalar); - float epsilon = 0.01; + float epsilon = 0.2; ASSERT_NEAR(score_scalar, score_avx512fp16, epsilon); ASSERT_NEAR(score_scalar, score_avx512, epsilon); ASSERT_NEAR(score_scalar, score_avx, epsilon); From 1f2b66f6c927fa2b6bdb1204cd17898fab8f8a9a Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 15:28:48 +0800 Subject: [PATCH 30/44] feat: support arm --- src/turbo/avx512/half_float/cosine.cc | 4 +- src/turbo/turbo.cc | 60 ++++++++++++++++++--------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc index 84028f6dd..d123197f9 100644 --- a/src/turbo/avx512/half_float/cosine.cc +++ b/src/turbo/avx512/half_float/cosine.cc @@ -37,7 +37,7 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, (void)b; (void)dim; (void)distance; -#endif // __AVX__ +#endif // __AVX512F__ } void cosine_fp16_batch_distance(const void *const *vectors, const void *query, @@ -50,7 +50,7 @@ void cosine_fp16_batch_distance(const void *const *vectors, const void *query, (void)n; (void)dim; (void)distances; -#endif //__AVX__ +#endif //__AVX512F__ } } // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index d06b96b1e..4d0d26215 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -64,49 +64,69 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, #if defined(__ARM_NEON) // INT8 if (data_type == DataType::kInt8) { - if (metric_type == MetricType::kSquaredEuclidean) { - } + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int8_distance; + } - if (metric_type == 
MetricType::kCosine) { - } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int8_distance; + } - if (metric_type == MetricType::kInnerProduct) { + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int8_distance; + } } } // INT$ if (data_type == DataType::kInt4) { - if (metric_type == MetricType::kSquaredEuclidean) { - } + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return scalar::squared_euclidean_int4_distance; + } - if (metric_type == MetricType::kCosine) { - } + if (metric_type == MetricType::kCosine) { + return scalar::cosine_int4_distance; + } - if (metric_type == MetricType::kInnerProduct) { + if (metric_type == MetricType::kInnerProduct) { + return scalar::inner_product_int4_distance; + } } } // FP32 if (data_type == DataType::kFp32) { - if (metric_type == MetricType::kSquaredEuclidean) { - } + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return armv8::squared_euclidean_fp32_distance; + } - if (metric_type == MetricType::kCosine) { - } + if (metric_type == MetricType::kCosine) { + return armv8::cosine_fp32_distance; + } - if (metric_type == MetricType::kInnerProduct) { + if (metric_type == MetricType::kInnerProduct) { + return armv8::inner_product_fp32_distance; + } } } // FP16 if (data_type == DataType::kFp16) { - if (metric_type == MetricType::kSquaredEuclidean) { - } + if (quantize_type == QuantizeType::kDefault) { + if (metric_type == MetricType::kSquaredEuclidean) { + return armv8::squared_euclidean_fp16_distance; + } - if (metric_type == MetricType::kCosine) { - } + if (metric_type == MetricType::kCosine) { + return armv8::cosine_fp16_distance; + } - if (metric_type == MetricType::kInnerProduct) { + if (metric_type == MetricType::kInnerProduct) { + return armv8::inner_product_fp16_distance; + } } } #else From 50fc6d70b7ea52388eb118397f86045a65d25359 Mon Sep 17 00:00:00 2001 From: ray Date: 
Mon, 13 Apr 2026 15:46:17 +0800 Subject: [PATCH 31/44] feat: add armv8 --- src/turbo/armv8/half_float/cosine.cc | 56 +++++++++++ src/turbo/armv8/half_float/cosine.h | 30 ++++++ src/turbo/armv8/half_float/inner_product.cc | 54 +++++++++++ src/turbo/armv8/half_float/inner_product.h | 31 ++++++ .../armv8/half_float/inner_product_common.h | 95 +++++++++++++++++++ .../armv8/half_float/squared_euclidean.cc | 58 +++++++++++ .../armv8/half_float/squared_euclidean.h | 31 ++++++ .../half_float/squared_euclidean_common.h | 94 ++++++++++++++++++ 8 files changed, 449 insertions(+) create mode 100644 src/turbo/armv8/half_float/cosine.cc create mode 100644 src/turbo/armv8/half_float/cosine.h create mode 100644 src/turbo/armv8/half_float/inner_product.cc create mode 100644 src/turbo/armv8/half_float/inner_product.h create mode 100644 src/turbo/armv8/half_float/inner_product_common.h create mode 100644 src/turbo/armv8/half_float/squared_euclidean.cc create mode 100644 src/turbo/armv8/half_float/squared_euclidean.h create mode 100644 src/turbo/armv8/half_float/squared_euclidean_common.h diff --git a/src/turbo/armv8/half_float/cosine.cc b/src/turbo/armv8/half_float/cosine.cc new file mode 100644 index 000000000..d32a844ed --- /dev/null +++ b/src/turbo/armv8/half_float/cosine.cc @@ -0,0 +1,56 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +#if defined(__ARM_NEON) +#include +#endif + +namespace zvec::turbo::armv8 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__ARM_NEON) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/cosine.h b/src/turbo/armv8/half_float/cosine.h new file mode 100644 index 000000000..7d79f7bd7 --- /dev/null +++ b/src/turbo/armv8/half_float/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. 
+void cosine_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/inner_product.cc b/src/turbo/armv8/half_float/inner_product.cc new file mode 100644 index 000000000..a12479e7c --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product.cc @@ -0,0 +1,54 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +using namespace zvec::turbo::avx512::internal; +#endif + +namespace zvec::turbo::avx512 { + +// Compute squared Euclidean distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, distance, 0ull, ) + +#endif +} + +// Batch version of inner_product_fp16_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/inner_product.h b/src/turbo/armv8/half_float/inner_product.h new file mode 100644 index 000000000..375315bce --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute inner product distance between a single quantized FP16 +// vector pair. +void inner_product_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. 
+void inner_product_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/inner_product_common.h b/src/turbo/armv8/half_float/inner_product_common.h new file mode 100644 index 000000000..5d077d2dc --- /dev/null +++ b/src/turbo/armv8/half_float/inner_product_common.h @@ -0,0 +1,95 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP16_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float16x8_t v_m = \ + vcombine_f16(vld1_f16((const float16_t *)m), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + float16x8_t v_q = \ + vcombine_f16(vld1_f16((const float16_t *)q), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(v_sum_0_0)), \ + vcvt_high_f32_f16(v_sum_0_0))); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#else +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP32_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float32x4_t v_m = vcvt_f32_f16(vld1_f16((const float16_t *)m)); \ + float32x4_t v_q = vcvt_f32_f16(vld1_f16((const float16_t *)q)); \ + ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(v_sum_0_0); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/squared_euclidean.cc b/src/turbo/armv8/half_float/squared_euclidean.cc new file mode 100644 index 000000000..1f83ee713 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean.cc @@ -0,0 +1,58 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/squared_euclidean.h" +#include "armv8/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const zvec::ailego::Float16 *lhs = + reinterpret_cast(a); + const zvec::ailego::Float16 *rhs = + reinterpret_cast(b); + + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, &distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__ARM_NEON) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/half_float/squared_euclidean.h b/src/turbo/armv8/half_float/squared_euclidean.h new file mode 100644 index 000000000..01e8bcf78 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. 
+void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/squared_euclidean_common.h b/src/turbo/armv8/half_float/squared_euclidean_common.h new file mode 100644 index 000000000..b378f0ba6 --- /dev/null +++ b/src/turbo/armv8/half_float/squared_euclidean_common.h @@ -0,0 +1,94 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP16_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float16x8_t v_m = \ + vcombine_f16(vld1_f16((const float16_t *)m), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + float16x8_t v_q = \ + vcombine_f16(vld1_f16((const float16_t *)q), \ + vreinterpret_f16_u64(vdup_n_u64((uint64_t)(_MASK)))); \ + ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(v_sum_0_0)), \ + vcvt_high_f32_f16(v_sum_0_0))); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#else +//! 
Compute the distance between matrix and query (FP16, M=1, N=1) +#define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ + MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ + const Float16 *qe = q + dim; \ + const Float16 *qe_aligned = q + ((dim >> 3) << 3); \ + for (; q != qe_aligned; m += 8, q += 8) { \ + MATRIX_FP16_ITER_1X1_NEON(m, q, v_sum, ACCUM_FP32_STEP_NEON) \ + } \ + if (qe >= qe_aligned + 4) { \ + float32x4_t v_m = vcvt_f32_f16(vld1_f16((const float16_t *)m)); \ + float32x4_t v_q = vcvt_f32_f16(vld1_f16((const float16_t *)q)); \ + ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum_0_0) \ + m += 4; \ + q += 4; \ + } \ + float result = vaddvq_f32(v_sum_0_0); \ + switch (qe - q) { \ + case 3: \ + ACCUM_FP16_STEP_GENERAL(m[2], q[2], result) \ + /* FALLTHRU */ \ + case 2: \ + ACCUM_FP16_STEP_GENERAL(m[1], q[1], result) \ + /* FALLTHRU */ \ + case 1: \ + ACCUM_FP16_STEP_GENERAL(m[0], q[0], result) \ + } \ + *out = _NORM(result); + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) From b0bfa890065390b53a822f31e7838a8c374d46d0 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 15:58:34 +0800 Subject: [PATCH 32/44] feat: add armv8 --- src/turbo/armv8/half_float/cosine.cc | 4 ---- src/turbo/armv8/half_float/inner_product.h | 2 +- src/turbo/armv8/half_float/squared_euclidean.h | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/turbo/armv8/half_float/cosine.cc b/src/turbo/armv8/half_float/cosine.cc index d32a844ed..e2eb5a6f7 100644 --- a/src/turbo/armv8/half_float/cosine.cc +++ b/src/turbo/armv8/half_float/cosine.cc @@ -16,10 +16,6 @@ #include "armv8/half_float/inner_product.h" #include "armv8/half_float/inner_product_common.h" -#if defined(__ARM_NEON) -#include -#endif - namespace zvec::turbo::armv8 { void cosine_fp32_distance(const void *a, const void *b, size_t dim, diff --git a/src/turbo/armv8/half_float/inner_product.h 
b/src/turbo/armv8/half_float/inner_product.h index 375315bce..cfd824459 100644 --- a/src/turbo/armv8/half_float/inner_product.h +++ b/src/turbo/armv8/half_float/inner_product.h @@ -23,7 +23,7 @@ namespace zvec::turbo::armv8 { void inner_product_fp16_distance(const void *a, const void *b, size_t dim, float *distance); -// Batch version of inner_product_fp32_distance. +// Batch version of inner_product_fp16_distance. void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); diff --git a/src/turbo/armv8/half_float/squared_euclidean.h b/src/turbo/armv8/half_float/squared_euclidean.h index 01e8bcf78..5a540b590 100644 --- a/src/turbo/armv8/half_float/squared_euclidean.h +++ b/src/turbo/armv8/half_float/squared_euclidean.h @@ -18,12 +18,12 @@ namespace zvec::turbo::armv8 { -// Compute squared euclidean distance between a single quantized FP32 +// Compute squared euclidean distance between a single quantized FP16 // vector pair. void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, float *distance); -// Batch version of squared euclidean FP32. +// Batch version of squared euclidean FP16. 
void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); From ebd51efafcabf8812033cc882524b9d59011563d Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 16:11:21 +0800 Subject: [PATCH 33/44] feat: add armv8 --- src/turbo/armv8/float32/cosine.cc | 56 +++++++++++++++++ src/turbo/armv8/float32/cosine.h | 30 +++++++++ src/turbo/armv8/float32/inner_product.cc | 52 ++++++++++++++++ src/turbo/armv8/float32/inner_product.h | 31 ++++++++++ .../armv8/float32/inner_product_common.h | 58 +++++++++++++++++ src/turbo/armv8/float32/squared_euclidean.cc | 56 +++++++++++++++++ src/turbo/armv8/float32/squared_euclidean.h | 31 ++++++++++ .../armv8/float32/squared_euclidean_common.h | 62 +++++++++++++++++++ 8 files changed, 376 insertions(+) create mode 100644 src/turbo/armv8/float32/cosine.cc create mode 100644 src/turbo/armv8/float32/cosine.h create mode 100644 src/turbo/armv8/float32/inner_product.cc create mode 100644 src/turbo/armv8/float32/inner_product.h create mode 100644 src/turbo/armv8/float32/inner_product_common.h create mode 100644 src/turbo/armv8/float32/squared_euclidean.cc create mode 100644 src/turbo/armv8/float32/squared_euclidean.h create mode 100644 src/turbo/armv8/float32/squared_euclidean_common.h diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc new file mode 100644 index 000000000..d32a844ed --- /dev/null +++ b/src/turbo/armv8/float32/cosine.cc @@ -0,0 +1,56 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/inner_product_common.h" + +#if defined(__ARM_NEON) +#include +#endif + +namespace zvec::turbo::armv8 { + +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + size_t original_dim = dim - extra_dim; + + float ip; + inner_product_fp32_distance(a, b, original_dim, &ip); + + *distance = 1 - ip; +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances) { +#if defined(__ARM_NEON) + +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/cosine.h b/src/turbo/armv8/float32/cosine.h new file mode 100644 index 000000000..529e11ef3 --- /dev/null +++ b/src/turbo/armv8/float32/cosine.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute cosine distance (negative inner product after normalization) between +// a single quantized FP32 vector pair. +void cosine_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of cosine_fp32_distance. +void cosine_fp32_batch_distance(const void *const *vectors, const void *query, + size_t n, size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/inner_product.cc b/src/turbo/armv8/float32/inner_product.cc new file mode 100644 index 000000000..695d06abc --- /dev/null +++ b/src/turbo/armv8/float32/inner_product.cc @@ -0,0 +1,52 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/float32/inner_product.h" +#include "armv8/float32/inner_product_common.h" + +using namespace zvec::turbo::ar::internal; +#endif + +namespace zvec::turbo::armv8 { + +// Compute squared Euclidean distance between a single quantized FP32 +// vector pair. +void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + inner_product_fp32_armv8(lhs, rhs, dim, distance, 0ull, ) + +#endif +} + +// Batch version of inner_product_fp16_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/inner_product.h b/src/turbo/armv8/float32/inner_product.h new file mode 100644 index 000000000..a1d8b612f --- /dev/null +++ b/src/turbo/armv8/float32/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute inner product distance between a single quantized FP32 +// vector pair. 
+void inner_product_fp32_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_fp32_distance. +void inner_product_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/float32/inner_product_common.h b/src/turbo/armv8/float32/inner_product_common.h new file mode 100644 index 000000000..10bab65b4 --- /dev/null +++ b/src/turbo/armv8/float32/inner_product_common.h @@ -0,0 +1,58 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +static __attribute__((always_inline)) void inner_product_fp32_armv8( + const float *last = lhs + size; + const float *last_aligned = lhs + ((size >> 3) << 3); + + float32x4_t v_sum_0 = vdupq_n_f32(0); + float32x4_t v_sum_1 = vdupq_n_f32(0); + + for (; lhs != last_aligned; lhs += 8, rhs += 8) { + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + v_sum_1 = vfmaq_f32(v_sum_1, vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + } + if (last >= last_aligned + 4) { + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs), vld1q_f32(rhs)); + lhs += 4; + rhs += 4; + } + + float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) + } + return result; +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/float32/squared_euclidean.cc b/src/turbo/armv8/float32/squared_euclidean.cc new file mode 100644 index 000000000..31e04e085 --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean.cc @@ -0,0 +1,56 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if defined(__ARM_NEON) +#include +#include +#include "armv8/half_float/squared_euclidean.h" +#include "armv8/half_float/squared_euclidean_common.h" + +using namespace zvec::turbo::armv8::internal; +#endif + +namespace zvec::turbo::armv8 { + +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, + float *distance) { +#if defined(__ARM_NEON) + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + squared_euclidean_fp32_armv8(lhs, rhs, dim, distance, 0ull, ) +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif // __ARM_NEON +} + +void squared_euclidean_fp32_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances) { +#if defined(__ARM_NEON) +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif //__ARM_NEON +} + +} // namespace zvec::turbo::armv8 \ No newline at end of file diff --git a/src/turbo/armv8/float32/squared_euclidean.h b/src/turbo/armv8/float32/squared_euclidean.h new file mode 100644 index 000000000..01e8bcf78 --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace zvec::turbo::armv8 { + +// Compute squared euclidean distance between a single quantized FP32 +// vector pair. 
+void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of squared euclidean FP32. +void squared_euclidean_fp16_batch_distance(const void *const *vectors, + const void *query, size_t n, + size_t dim, float *distances); + +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/float32/squared_euclidean_common.h b/src/turbo/armv8/float32/squared_euclidean_common.h new file mode 100644 index 000000000..730444e84 --- /dev/null +++ b/src/turbo/armv8/float32/squared_euclidean_common.h @@ -0,0 +1,62 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__ARM_NEON) +#include +#include +#include +#include + +using namespace zvec::ailego; + +namespace zvec::turbo::armv8::internal { + +static __attribute__((always_inline)) void squared_euclidean_fp_armv8( + const float *last = lhs + size; + const float *last_aligned = lhs + ((size >> 3) << 3); + + float32x4_t v_sum_0 = vdupq_n_f32(0); + float32x4_t v_sum_1 = vdupq_n_f32(0); + + for (; lhs != last_aligned; lhs += 8, rhs += 8) { + float32x4_t v_d_0 = vsubq_f32(vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + float32x4_t v_d_1 = vsubq_f32(vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d_0, v_d_0); + v_sum_1 = vfmaq_f32(v_sum_1, v_d_1, v_d_1); + } + if (last >= last_aligned + 4) { + float32x4_t v_d = vsubq_f32(vld1q_f32(lhs), vld1q_f32(rhs)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d, v_d); + lhs += 4; + rhs += 4; + } + + float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); + switch (last - lhs) { + case 3: + SSD_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(lhs[0], rhs[0], result) + } + *out = result; + +} // namespace zvec::turbo::armv8::internal + +#endif // defined(__ARM_NEON) From fe8d72a5b64f33f756051c6deb76f4d5065da0b0 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 16:39:34 +0800 Subject: [PATCH 34/44] fix: armv8 --- src/turbo/CMakeLists.txt | 13 +++++ src/turbo/armv8/float32/cosine.cc | 10 ++-- .../armv8/float32/inner_product_common.h | 14 +++++- src/turbo/armv8/float32/squared_euclidean.h | 4 +- .../armv8/float32/squared_euclidean_common.h | 9 +++- src/turbo/armv8/half_float/cosine.cc | 6 +-- src/turbo/armv8/half_float/inner_product.cc | 6 +-- .../armv8/half_float/inner_product_common.h | 37 ++++++++++++++ .../armv8/half_float/squared_euclidean.cc | 2 +- .../half_float/squared_euclidean_common.h | 49 +++++++++++++++++++ src/turbo/avx/float32/common.h | 8 --- .../avx/half_float/inner_product_common.h 
| 8 --- .../avx/half_float/squared_euclidean_common.h | 8 --- src/turbo/avx2/half_float_converter/common.h | 8 --- .../inner_product_common.h | 8 --- .../inner_product_common.h | 8 --- .../squared_euclidean_common.h | 8 --- src/turbo/avx512/float32/common.h | 8 --- .../avx512/half_float/inner_product_common.h | 8 --- .../half_float/squared_euclidean_common.h | 8 --- .../half_float/inner_product_common.h | 8 --- .../half_float/squared_euclidean_common.h | 8 --- .../scalar/record_quantized_int4/common.h | 8 --- .../scalar/record_quantized_int8/common.h | 8 --- src/turbo/sse/record_quantized_int4/common.h | 8 --- src/turbo/sse/record_quantized_int8/common.h | 8 --- src/turbo/turbo.cc | 6 +++ 27 files changed, 136 insertions(+), 148 deletions(-) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index eae831309..e51f72b1a 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -65,6 +65,19 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_SSE}" ) + elseif (HOST_ARCH MATCHES "^(arm|arm64)$") + set(TURBO_MARCH_FLAG_NEON "-march=armv8-a") + + file(GLOB_RECURSE NEON_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/armv8/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/armv8/*.c + ) + + set_source_files_properties( + ${NEON_SRCS} + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_NEON}" + ) endif() endif() diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc index d32a844ed..0d5e7b79d 100644 --- a/src/turbo/armv8/float32/cosine.cc +++ b/src/turbo/armv8/float32/cosine.cc @@ -12,13 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "armv8/half_float/cosine.h" -#include "armv8/half_float/inner_product.h" -#include "armv8/half_float/inner_product_common.h" - -#if defined(__ARM_NEON) -#include -#endif +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/inner_product_common.h" namespace zvec::turbo::armv8 { diff --git a/src/turbo/armv8/float32/inner_product_common.h b/src/turbo/armv8/float32/inner_product_common.h index 10bab65b4..a9a045dc3 100644 --- a/src/turbo/armv8/float32/inner_product_common.h +++ b/src/turbo/armv8/float32/inner_product_common.h @@ -22,9 +22,17 @@ using namespace zvec::ailego; +//! Calculate Fused-Multiply-Add (GENERAL) +#define FMA_FP32_GENERAL(m, q, sum) sum += (m * q); + namespace zvec::turbo::armv8::internal { -static __attribute__((always_inline)) void inner_product_fp32_armv8( +static __attribute__((always_inline)) void inner_product_fp32_armv8(const void *a, + const void *b, size_t size, + float *distance) { + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + const float *last = lhs + size; const float *last_aligned = lhs + ((size >> 3) << 3); @@ -52,7 +60,9 @@ static __attribute__((always_inline)) void inner_product_fp32_armv8( case 1: FMA_FP32_GENERAL(lhs[0], rhs[0], result) } - return result; + *distance = result; +} + } // namespace zvec::turbo::armv8::internal #endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/float32/squared_euclidean.h b/src/turbo/armv8/float32/squared_euclidean.h index 01e8bcf78..3df75f17a 100644 --- a/src/turbo/armv8/float32/squared_euclidean.h +++ b/src/turbo/armv8/float32/squared_euclidean.h @@ -20,11 +20,11 @@ namespace zvec::turbo::armv8 { // Compute squared euclidean distance between a single quantized FP32 // vector pair. 
-void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, +void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, float *distance); // Batch version of squared euclidean FP32. -void squared_euclidean_fp16_batch_distance(const void *const *vectors, +void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances); diff --git a/src/turbo/armv8/float32/squared_euclidean_common.h b/src/turbo/armv8/float32/squared_euclidean_common.h index 730444e84..459b2d58d 100644 --- a/src/turbo/armv8/float32/squared_euclidean_common.h +++ b/src/turbo/armv8/float32/squared_euclidean_common.h @@ -24,8 +24,13 @@ using namespace zvec::ailego; namespace zvec::turbo::armv8::internal { -static __attribute__((always_inline)) void squared_euclidean_fp_armv8( - const float *last = lhs + size; +static __attribute__((always_inline)) void squared_euclidean_fp_armv8(const void *a, + const void *b, size_t size, + float *distance) { + const float *lhs = reinterpret_cast(a); + const float *rhs = reinterpret_cast(b); + + const float *last = lhs + size; const float *last_aligned = lhs + ((size >> 3) << 3); float32x4_t v_sum_0 = vdupq_n_f32(0); diff --git a/src/turbo/armv8/half_float/cosine.cc b/src/turbo/armv8/half_float/cosine.cc index e2eb5a6f7..91792b03f 100644 --- a/src/turbo/armv8/half_float/cosine.cc +++ b/src/turbo/armv8/half_float/cosine.cc @@ -18,14 +18,14 @@ namespace zvec::turbo::armv8 { -void cosine_fp32_distance(const void *a, const void *b, size_t dim, +void cosine_fp16_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__ARM_NEON) constexpr size_t extra_dim = 2; size_t original_dim = dim - extra_dim; float ip; - inner_product_fp32_distance(a, b, original_dim, &ip); + inner_product_fp16_distance(a, b, original_dim, &ip); *distance = 1 - ip; #else @@ -36,7 +36,7 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, #endif // 
__ARM_NEON } -void cosine_fp32_batch_distance(const void *const *vectors, const void *query, +void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/inner_product.cc b/src/turbo/armv8/half_float/inner_product.cc index a12479e7c..03831a986 100644 --- a/src/turbo/armv8/half_float/inner_product.cc +++ b/src/turbo/armv8/half_float/inner_product.cc @@ -20,10 +20,10 @@ #include "armv8/half_float/inner_product.h" #include "armv8/half_float/inner_product_common.h" -using namespace zvec::turbo::avx512::internal; +using namespace zvec::turbo::armv8::internal; #endif -namespace zvec::turbo::avx512 { +namespace zvec::turbo::armv8 { // Compute squared Euclidean distance between a single quantized FP16 // vector pair. @@ -51,4 +51,4 @@ void inner_product_fp16_batch_distance(const void *const *vectors, (void)distances; } -} // namespace zvec::turbo::avx512 \ No newline at end of file +} // namespace zvec::turbo::armv8 diff --git a/src/turbo/armv8/half_float/inner_product_common.h b/src/turbo/armv8/half_float/inner_product_common.h index 5d077d2dc..1ac007d07 100644 --- a/src/turbo/armv8/half_float/inner_product_common.h +++ b/src/turbo/armv8/half_float/inner_product_common.h @@ -24,8 +24,28 @@ using namespace zvec::ailego; namespace zvec::turbo::armv8::internal { +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Scalar fused multiply-add for inner product (FP16 general) +#define ACCUM_FP16_STEP_GENERAL(m, q, sum) sum += (m * q); + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +//! NEON fused multiply-add for inner product (FP16) +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) v_sum = vfmaq_f16(v_sum, v_m, v_q); + +//! 
Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + _PROC(v_m, v_q, _RES##_0_0) \ + } + //! Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ @@ -60,6 +80,23 @@ namespace zvec::turbo::armv8::internal { *out = _NORM(result); #else + +//! NEON fused multiply-add for inner product (FP32) +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) v_sum = vfmaq_f32(v_sum, v_m, v_q); + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + float32x4_t v_m_0 = vcvt_f32_f16(vget_low_f16(v_m)); \ + float32x4_t v_q_0 = vcvt_f32_f16(vget_low_f16(v_q)); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + v_m_0 = vcvt_high_f32_f16(v_m); \ + v_q_0 = vcvt_high_f32_f16(v_q); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + } + //! 
Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ diff --git a/src/turbo/armv8/half_float/squared_euclidean.cc b/src/turbo/armv8/half_float/squared_euclidean.cc index 1f83ee713..8f197cad9 100644 --- a/src/turbo/armv8/half_float/squared_euclidean.cc +++ b/src/turbo/armv8/half_float/squared_euclidean.cc @@ -33,7 +33,7 @@ void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, const zvec::ailego::Float16 *rhs = reinterpret_cast(b); - ACCUM_FP16_1X1_NEON(lhs, rhs, dim, &distance, 0ull, ) + ACCUM_FP16_1X1_NEON(lhs, rhs, dim, distance, 0ull, ) #else (void)a; (void)b; diff --git a/src/turbo/armv8/half_float/squared_euclidean_common.h b/src/turbo/armv8/half_float/squared_euclidean_common.h index b378f0ba6..382c58994 100644 --- a/src/turbo/armv8/half_float/squared_euclidean_common.h +++ b/src/turbo/armv8/half_float/squared_euclidean_common.h @@ -24,7 +24,35 @@ using namespace zvec::ailego; namespace zvec::turbo::armv8::internal { +#define MATRIX_VAR_INIT_1X1(_VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + _VAR_TYPE _VAR_NAME##_0_0 = (_VAR_INIT); + +#define MATRIX_VAR_INIT(_M, _N, _VAR_TYPE, _VAR_NAME, _VAR_INIT) \ + MATRIX_VAR_INIT_##_M##X##_N(_VAR_TYPE, _VAR_NAME, _VAR_INIT) + +//! Scalar sum of squared difference (FP16 general) +#define ACCUM_FP16_STEP_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +//! NEON sum of squared difference (FP16) +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float16x8_t v_d = vsubq_f16(v_m, v_q); \ + v_sum = vfmaq_f16(v_sum, v_d, v_d); \ + } + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + _PROC(v_m, v_q, _RES##_0_0) \ + } //! 
Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, float16x8_t, v_sum, vdupq_n_f16(0)) \ @@ -59,6 +87,27 @@ namespace zvec::turbo::armv8::internal { *out = _NORM(result); #else + +//! NEON sum of squared difference (FP32) +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float32x4_t v_d = vsubq_f32(v_m, v_q); \ + v_sum = vfmaq_f32(v_sum, v_d, v_d); \ + } + +//! Iterative process of computing distance (FP16, M=1, N=1) +#define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ + { \ + float16x8_t v_m = vld1q_f16((const float16_t *)m); \ + float16x8_t v_q = vld1q_f16((const float16_t *)q); \ + float32x4_t v_m_0 = vcvt_f32_f16(vget_low_f16(v_m)); \ + float32x4_t v_q_0 = vcvt_f32_f16(vget_low_f16(v_q)); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + v_m_0 = vcvt_high_f32_f16(v_m); \ + v_q_0 = vcvt_high_f32_f16(v_q); \ + _PROC(v_m_0, v_q_0, _RES##_0_0) \ + } + //! Compute the distance between matrix and query (FP16, M=1, N=1) #define ACCUM_FP16_1X1_NEON(m, q, dim, out, _MASK, _NORM) \ MATRIX_VAR_INIT(1, 1, float32x4_t, v_sum, vdupq_n_f32(0)) \ diff --git a/src/turbo/avx/float32/common.h b/src/turbo/avx/float32/common.h index 6d3f91d12..cb22033cc 100644 --- a/src/turbo/avx/float32/common.h +++ b/src/turbo/avx/float32/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- #pragma once #if defined(__AVX__) diff --git a/src/turbo/avx/half_float/inner_product_common.h b/src/turbo/avx/half_float/inner_product_common.h index 51af98f28..a6816d022 100644 --- a/src/turbo/avx/half_float/inner_product_common.h +++ b/src/turbo/avx/half_float/inner_product_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX__) diff --git a/src/turbo/avx/half_float/squared_euclidean_common.h b/src/turbo/avx/half_float/squared_euclidean_common.h index edc5252af..8e58393d7 100644 --- a/src/turbo/avx/half_float/squared_euclidean_common.h +++ b/src/turbo/avx/half_float/squared_euclidean_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- #pragma once #if defined(__AVX__) diff --git a/src/turbo/avx2/half_float_converter/common.h b/src/turbo/avx2/half_float_converter/common.h index 4f11cc2a9..1b05591e8 100644 --- a/src/turbo/avx2/half_float_converter/common.h +++ b/src/turbo/avx2/half_float_converter/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int4/inner_product_common.h b/src/turbo/avx2/record_quantized_int4/inner_product_common.h index 6d12504e3..8c96f5fb0 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product_common.h +++ b/src/turbo/avx2/record_quantized_int4/inner_product_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- #pragma once #if defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int8/inner_product_common.h b/src/turbo/avx2/record_quantized_int8/inner_product_common.h index e49b36dd3..0176f277a 100644 --- a/src/turbo/avx2/record_quantized_int8/inner_product_common.h +++ b/src/turbo/avx2/record_quantized_int8/inner_product_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX2__) diff --git a/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h index b352108ed..e460ade68 100644 --- a/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h +++ b/src/turbo/avx2/record_quantized_int8/squared_euclidean_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- #pragma once #if defined(__AVX2__) diff --git a/src/turbo/avx512/float32/common.h b/src/turbo/avx512/float32/common.h index 36111ab18..af04d0e41 100644 --- a/src/turbo/avx512/float32/common.h +++ b/src/turbo/avx512/float32/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/inner_product_common.h b/src/turbo/avx512/half_float/inner_product_common.h index 4f36ee1e8..dcd6f2a83 100644 --- a/src/turbo/avx512/half_float/inner_product_common.h +++ b/src/turbo/avx512/half_float/inner_product_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX512F__) diff --git a/src/turbo/avx512/half_float/squared_euclidean_common.h b/src/turbo/avx512/half_float/squared_euclidean_common.h index d05842495..6ff8c4254 100644 --- a/src/turbo/avx512/half_float/squared_euclidean_common.h +++ b/src/turbo/avx512/half_float/squared_euclidean_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX512F__) diff --git a/src/turbo/avx512_fp16/half_float/inner_product_common.h b/src/turbo/avx512_fp16/half_float/inner_product_common.h index 50c9e8053..30921e038 100644 --- a/src/turbo/avx512_fp16/half_float/inner_product_common.h +++ b/src/turbo/avx512_fp16/half_float/inner_product_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX512FP16__) diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h index c769b067f..b5f91988e 100644 --- a/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean_common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). 
-// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__AVX512FP16__) diff --git a/src/turbo/scalar/record_quantized_int4/common.h b/src/turbo/scalar/record_quantized_int4/common.h index 4257a66ed..f4b74d7d3 100644 --- a/src/turbo/scalar/record_quantized_int4/common.h +++ b/src/turbo/scalar/record_quantized_int4/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #include diff --git a/src/turbo/scalar/record_quantized_int8/common.h b/src/turbo/scalar/record_quantized_int8/common.h index 92ab3736d..d0b7186ae 100644 --- a/src/turbo/scalar/record_quantized_int8/common.h +++ b/src/turbo/scalar/record_quantized_int8/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- #pragma once #include diff --git a/src/turbo/sse/record_quantized_int4/common.h b/src/turbo/sse/record_quantized_int4/common.h index 66ba30fa0..623d6365a 100644 --- a/src/turbo/sse/record_quantized_int4/common.h +++ b/src/turbo/sse/record_quantized_int4/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. - #pragma once #if defined(__SSE4_1__) diff --git a/src/turbo/sse/record_quantized_int8/common.h b/src/turbo/sse/record_quantized_int8/common.h index 1f44d04ab..b48b2598e 100644 --- a/src/turbo/sse/record_quantized_int8/common.h +++ b/src/turbo/sse/record_quantized_int8/common.h @@ -12,14 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Shared AVX512-VNNI inner product kernels for record_quantized_int8 distance -// implementations (cosine, l2, mips_l2, etc.). -// -// All functions are marked always_inline so that when this header is included -// from a per-file-march .cc translation unit, the compiler can fully inline -// and optimize them under the correct -march flag without any cross-TU call -// overhead. 
- #pragma once #if defined(__SSE__) diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 4d0d26215..bb9067851 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -55,6 +55,12 @@ #include "sse/record_quantized_int8/cosine.h" #include "sse/record_quantized_int8/inner_product.h" #include "sse/record_quantized_int8/squared_euclidean.h" +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/squared_euclidean.h" +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/squared_euclidean.h" namespace zvec::turbo { From f29d6dd3cfe8df13d91011a268639b8cde5c285d Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 16:41:58 +0800 Subject: [PATCH 35/44] fix: fix typo --- src/turbo/armv8/float32/inner_product.cc | 8 ++--- src/turbo/armv8/float32/squared_euclidean.cc | 9 ++--- .../armv8/float32/squared_euclidean_common.h | 33 +++++++++++-------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/turbo/armv8/float32/inner_product.cc b/src/turbo/armv8/float32/inner_product.cc index 695d06abc..dbc5a3048 100644 --- a/src/turbo/armv8/float32/inner_product.cc +++ b/src/turbo/armv8/float32/inner_product.cc @@ -20,7 +20,7 @@ #include "armv8/float32/inner_product.h" #include "armv8/float32/inner_product_common.h" -using namespace zvec::turbo::ar::internal; +using namespace zvec::turbo::armv8::internal; #endif namespace zvec::turbo::armv8 { @@ -30,11 +30,7 @@ namespace zvec::turbo::armv8 { void inner_product_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__ARM_NEON) - const float *lhs = reinterpret_cast(a); - const float *rhs = reinterpret_cast(b); - - inner_product_fp32_armv8(lhs, rhs, dim, distance, 0ull, ) - + inner_product_fp32_armv8(a, b, dim, distance); #endif } diff --git a/src/turbo/armv8/float32/squared_euclidean.cc b/src/turbo/armv8/float32/squared_euclidean.cc index 31e04e085..a2803d9ae 100644 --- 
a/src/turbo/armv8/float32/squared_euclidean.cc +++ b/src/turbo/armv8/float32/squared_euclidean.cc @@ -17,8 +17,8 @@ #if defined(__ARM_NEON) #include #include -#include "armv8/half_float/squared_euclidean.h" -#include "armv8/half_float/squared_euclidean_common.h" +#include "armv8/float32/squared_euclidean.h" +#include "armv8/float32/squared_euclidean_common.h" using namespace zvec::turbo::armv8::internal; #endif @@ -28,10 +28,7 @@ namespace zvec::turbo::armv8 { void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, float *distance) { #if defined(__ARM_NEON) - const float *lhs = reinterpret_cast(a); - const float *rhs = reinterpret_cast(b); - - squared_euclidean_fp32_armv8(lhs, rhs, dim, distance, 0ull, ) + squared_euclidean_fp32_armv8(a, b, dim, distance); #else (void)a; (void)b; diff --git a/src/turbo/armv8/float32/squared_euclidean_common.h b/src/turbo/armv8/float32/squared_euclidean_common.h index 459b2d58d..a1dd4643d 100644 --- a/src/turbo/armv8/float32/squared_euclidean_common.h +++ b/src/turbo/armv8/float32/squared_euclidean_common.h @@ -22,14 +22,20 @@ using namespace zvec::ailego; +//! 
Calculate Sum-of-Squared-Differences (GENERAL) +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + namespace zvec::turbo::armv8::internal { -static __attribute__((always_inline)) void squared_euclidean_fp_armv8(const void *a, - const void *b, size_t size, - float *distance) { +static __attribute__((always_inline)) void squared_euclidean_fp32_armv8( + const void *a, const void *b, size_t size, float *distance) { const float *lhs = reinterpret_cast(a); const float *rhs = reinterpret_cast(b); - + const float *last = lhs + size; const float *last_aligned = lhs + ((size >> 3) << 3); @@ -37,16 +43,16 @@ static __attribute__((always_inline)) void squared_euclidean_fp_armv8(const void float32x4_t v_sum_1 = vdupq_n_f32(0); for (; lhs != last_aligned; lhs += 8, rhs += 8) { - float32x4_t v_d_0 = vsubq_f32(vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); - float32x4_t v_d_1 = vsubq_f32(vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); - v_sum_0 = vfmaq_f32(v_sum_0, v_d_0, v_d_0); - v_sum_1 = vfmaq_f32(v_sum_1, v_d_1, v_d_1); + float32x4_t v_d_0 = vsubq_f32(vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + float32x4_t v_d_1 = vsubq_f32(vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d_0, v_d_0); + v_sum_1 = vfmaq_f32(v_sum_1, v_d_1, v_d_1); } if (last >= last_aligned + 4) { - float32x4_t v_d = vsubq_f32(vld1q_f32(lhs), vld1q_f32(rhs)); - v_sum_0 = vfmaq_f32(v_sum_0, v_d, v_d); - lhs += 4; - rhs += 4; + float32x4_t v_d = vsubq_f32(vld1q_f32(lhs), vld1q_f32(rhs)); + v_sum_0 = vfmaq_f32(v_sum_0, v_d, v_d); + lhs += 4; + rhs += 4; } float result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); @@ -60,7 +66,8 @@ static __attribute__((always_inline)) void squared_euclidean_fp_armv8(const void case 1: SSD_FP32_GENERAL(lhs[0], rhs[0], result) } - *out = result; + *distance = result; +} } // namespace zvec::turbo::armv8::internal From 53ffc8e984011f9a34d1a23658c77b78fa80db98 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 17:13:19 +0800 
Subject: [PATCH 36/44] fix: fix dist --- src/turbo/armv8/float32/cosine.cc | 2 +- .../armv8/float32/inner_product_common.h | 33 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc index 0d5e7b79d..83d3c717b 100644 --- a/src/turbo/armv8/float32/cosine.cc +++ b/src/turbo/armv8/float32/cosine.cc @@ -27,7 +27,7 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, float ip; inner_product_fp32_distance(a, b, original_dim, &ip); - *distance = 1 - ip; + *distance = 1 + ip; #else (void)a; (void)b; diff --git a/src/turbo/armv8/float32/inner_product_common.h b/src/turbo/armv8/float32/inner_product_common.h index a9a045dc3..fe75269ed 100644 --- a/src/turbo/armv8/float32/inner_product_common.h +++ b/src/turbo/armv8/float32/inner_product_common.h @@ -27,9 +27,8 @@ using namespace zvec::ailego; namespace zvec::turbo::armv8::internal { -static __attribute__((always_inline)) void inner_product_fp32_armv8(const void *a, - const void *b, size_t size, - float *distance) { +static __attribute__((always_inline)) void inner_product_fp32_armv8( + const void *a, const void *b, size_t size, float *distance) { const float *lhs = reinterpret_cast(a); const float *rhs = reinterpret_cast(b); @@ -40,27 +39,27 @@ static __attribute__((always_inline)) void inner_product_fp32_armv8(const void * float32x4_t v_sum_1 = vdupq_n_f32(0); for (; lhs != last_aligned; lhs += 8, rhs += 8) { - v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); - v_sum_1 = vfmaq_f32(v_sum_1, vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs + 0), vld1q_f32(rhs + 0)); + v_sum_1 = vfmaq_f32(v_sum_1, vld1q_f32(lhs + 4), vld1q_f32(rhs + 4)); } if (last >= last_aligned + 4) { - v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs), vld1q_f32(rhs)); - lhs += 4; - rhs += 4; + v_sum_0 = vfmaq_f32(v_sum_0, vld1q_f32(lhs), vld1q_f32(rhs)); + lhs += 4; + rhs += 4; } float 
result = vaddvq_f32(vaddq_f32(v_sum_0, v_sum_1)); switch (last - lhs) { - case 3: - FMA_FP32_GENERAL(lhs[2], rhs[2], result) - /* FALLTHRU */ - case 2: - FMA_FP32_GENERAL(lhs[1], rhs[1], result) - /* FALLTHRU */ - case 1: - FMA_FP32_GENERAL(lhs[0], rhs[0], result) + case 3: + FMA_FP32_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(lhs[0], rhs[0], result) } - *distance = result; + *distance = -result; } } // namespace zvec::turbo::armv8::internal From 3e45b87db9fc2611d39c5a2909267f9e4b827a86 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 17:38:10 +0800 Subject: [PATCH 37/44] fix: fix dist --- src/turbo/armv8/float32/cosine.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc index 83d3c717b..09b064d55 100644 --- a/src/turbo/armv8/float32/cosine.cc +++ b/src/turbo/armv8/float32/cosine.cc @@ -25,9 +25,9 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, size_t original_dim = dim - extra_dim; float ip; - inner_product_fp32_distance(a, b, original_dim, &ip); + internal::inner_product_fp32_armv8(a, b, original_dim, &ip); - *distance = 1 + ip; + *distance = 1 - ip; #else (void)a; (void)b; From e26610a866ff6cceac3c696db8211bd537ba99d0 Mon Sep 17 00:00:00 2001 From: ray Date: Mon, 13 Apr 2026 19:15:26 +0800 Subject: [PATCH 38/44] fix: vnni inner product --- src/turbo/armv8/float32/cosine.cc | 2 +- .../record_quantized_int8/inner_product.cc | 61 +++++++++++++++++++ .../record_quantized_int8/inner_product.h | 31 ++++++++++ src/turbo/turbo.cc | 17 ++++-- 4 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc create mode 100644 src/turbo/avx512_vnni/record_quantized_int8/inner_product.h diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc index 
09b064d55..49f191103 100644 --- a/src/turbo/armv8/float32/cosine.cc +++ b/src/turbo/armv8/float32/cosine.cc @@ -19,7 +19,7 @@ namespace zvec::turbo::armv8 { void cosine_fp32_distance(const void *a, const void *b, size_t dim, - float *distance) { + size_t extra_size, float *distance) { #if defined(__ARM_NEON) constexpr size_t extra_dim = 2; size_t original_dim = dim - extra_dim; diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc new file mode 100644 index 000000000..09feca80b --- /dev/null +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc @@ -0,0 +1,61 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "avx512_vnni/record_quantized_int8/inner_product.h" +#include +#include "avx512_vnni/record_quantized_int8/common.h" + +namespace zvec::turbo::avx512_vnni { + +// Compute inner product distance between a single quantized int8 +// vector pair. 
+void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance) { + const size_t original_dim = dim - 20; + + if (original_dim <= 0) { + return; + } + + internal::ip_int8_avx512_vnni(a, b, original_dim, distance); + + const float *a_tail = reinterpret_cast( + reinterpret_cast(a) + original_dim); + const float *b_tail = reinterpret_cast( + reinterpret_cast(b) + original_dim); + + float qa = a_tail[0]; + float qb = a_tail[1]; + float qs = a_tail[2]; + + float ma = b_tail[0]; + float mb = b_tail[1]; + float ms = b_tail[2]; + + *distance = -(ma * qa * *distance + mb * qa * qs + qb * ma * ms + + original_dim * qb * mb); +} + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances) { + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +} + +} // namespace zvec::turbo::avx512_vnni \ No newline at end of file diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h new file mode 100644 index 000000000..25f0ce109 --- /dev/null +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.h @@ -0,0 +1,31 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace zvec::turbo::avx512_vnni { + +// Compute inner product distance between a single quantized int8 +// vector pair. +void inner_product_int8_distance(const void *a, const void *b, size_t dim, + float *distance); + +// Batch version of inner_product_int8_distance. +void inner_product_int8_batch_distance(const void *const *vectors, + const void *query, size_t n, size_t dim, + float *distances); + +} // namespace zvec::turbo::avx512_vnni diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index bb9067851..1fb5dcd7e 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -14,6 +14,12 @@ #include #include +#include "armv8/float32/cosine.h" +#include "armv8/float32/inner_product.h" +#include "armv8/float32/squared_euclidean.h" +#include "armv8/half_float/cosine.h" +#include "armv8/half_float/inner_product.h" +#include "armv8/half_float/squared_euclidean.h" #include "avx/float32/cosine.h" #include "avx/float32/inner_product.h" #include "avx/float32/squared_euclidean.h" @@ -36,6 +42,7 @@ #include "avx512_fp16/half_float/inner_product.h" #include "avx512_fp16/half_float/squared_euclidean.h" #include "avx512_vnni/record_quantized_int8/cosine.h" +#include "avx512_vnni/record_quantized_int8/inner_product.h" #include "avx512_vnni/record_quantized_int8/squared_euclidean.h" #include "scalar/float32/cosine.h" #include "scalar/float32/inner_product.h" @@ -55,12 +62,6 @@ #include "sse/record_quantized_int8/cosine.h" #include "sse/record_quantized_int8/inner_product.h" #include "sse/record_quantized_int8/squared_euclidean.h" -#include "armv8/float32/cosine.h" -#include "armv8/float32/inner_product.h" -#include "armv8/float32/squared_euclidean.h" -#include "armv8/half_float/cosine.h" -#include "armv8/half_float/inner_product.h" -#include "armv8/half_float/squared_euclidean.h" namespace zvec::turbo { @@ -148,6 +149,10 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, if (metric_type == MetricType::kCosine) { return 
avx512_vnni::cosine_int8_distance; } + + if (metric_type == MetricType::kInnerProduct) { + return avx512_vnni::inner_product_int8_distance; + } } if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2 && From b433e6bde9160af599eaaff29c309f22e5aeb078 Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 14 Apr 2026 12:29:46 +0800 Subject: [PATCH 39/44] fix: fix batch ut --- tests/turbo/turbo_cosine_test.cc | 40 +- tests/turbo/turbo_euclidean_test.cc | 22 +- tests/turbo/turbo_inner_product_test.cc | 22 +- tests/turbo/turbo_quantized_integer_test.cc | 862 ++++++++++++++++++-- 4 files changed, 828 insertions(+), 118 deletions(-) diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc index f77b5e774..a4f1d3072 100644 --- a/tests/turbo/turbo_cosine_test.cc +++ b/tests/turbo/turbo_cosine_test.cc @@ -28,7 +28,7 @@ TEST(CosineMetric, TestFp32Cosine) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -55,21 +55,21 @@ TEST(CosineMetric, TestFp32Cosine) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - 
std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -97,7 +97,7 @@ TEST(CosineMetric, TestFp16Cosine) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -128,21 +128,21 @@ TEST(CosineMetric, TestFp16Cosine) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc index 51f9bad49..c472b33ab 100644 --- a/tests/turbo/turbo_euclidean_test.cc +++ b/tests/turbo/turbo_euclidean_test.cc @@ -27,7 +27,7 @@ TEST(SquaredEuclideanMetric, TestFp32SquaredEuclidean) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto func_avx512 = turbo::get_distance_func( turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, @@ -74,7 +74,7 @@ TEST(SquaredEuclideanMetric, 
TestFp16SquaredEuclidean) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -105,21 +105,21 @@ TEST(SquaredEuclideanMetric, TestFp16SquaredEuclidean) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc index ff0fa8144..8aaa1f422 100644 --- a/tests/turbo/turbo_inner_product_test.cc +++ b/tests/turbo/turbo_inner_product_test.cc @@ -27,7 +27,7 @@ TEST(InnerProductMetric, TestFp32InnerProduct) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto func_avx512 = turbo::get_distance_func( turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, @@ -74,7 +74,7 @@ TEST(InnerProductMetric, TestFp16InnerProduct) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = 
std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -105,21 +105,21 @@ TEST(InnerProductMetric, TestFp16InnerProduct) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc index 252b2e278..a31dbcbd4 100644 --- a/tests/turbo/turbo_quantized_integer_test.cc +++ b/tests/turbo/turbo_quantized_integer_test.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -68,21 +69,21 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProduct) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + 
qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -123,7 +124,7 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -155,21 +156,21 @@ TEST(QuantizedIntegerMetric, TestInt4InnerProduct) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - 
ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -205,7 +206,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -237,21 +238,21 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclidean) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -287,7 +288,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; - const size_t COUNT = 1000; + const size_t COUNT = 1024; auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -319,21 +320,21 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclidean) { query_vec[j] = dist(gen); } + IndexQueryMeta 
qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -369,7 +370,7 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); - const size_t COUNT = 1000; + const size_t COUNT = 1024; IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); meta.set_metric("Cosine", 0, Params()); @@ -418,28 +419,34 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - 
qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta fp32_qmeta_reformer; - float score_float32{0.0f}; float score_scalar{0.0f}; float score_avx512vnni{0.0f}; float score_avx2{0.0f}; float score_sse{0.0f}; - std::string fp32_query_out; - ASSERT_EQ(0, - fp32_reformer->transform(query_vec.data(), qmeta, &fp32_query_out, - &fp32_qmeta_reformer)); - ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); - std::string fp32_doc_out; ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, &fp32_qmeta_reformer)); @@ -448,13 +455,6 @@ TEST(QuantizedIntegerMetric, TestInt8Cosine) { func_float32(fp32_query_out.data(), fp32_doc_out.data(), fp32_qmeta_reformer.dimension(), &score_float32); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -487,7 +487,7 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { std::uniform_real_distribution dist(-1.0, 2.0); const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; - const size_t COUNT = 1000; + const size_t COUNT = 1024; IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); meta.set_metric("Cosine", 0, Params()); @@ -531,27 +531,33 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { query_vec[j] = dist(gen); } + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + 
ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + for (size_t i = 0; i < COUNT; ++i) { ailego::NumericalVector doc_vec(DIMENSION); for (size_t j = 0; j < DIMENSION; ++j) { doc_vec[j] = dist(gen); } - IndexQueryMeta qmeta; - qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); - IndexQueryMeta fp32_qmeta_reformer; - float score_float32{0.0f}; float score_scalar{0.0f}; float score_avx2{0.0f}; float score_sse{0.0f}; - std::string fp32_query_out; - ASSERT_EQ(0, - fp32_reformer->transform(query_vec.data(), qmeta, &fp32_query_out, - &fp32_qmeta_reformer)); - ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); - std::string fp32_doc_out; ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, &fp32_qmeta_reformer)); @@ -560,13 +566,6 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { func_float32(fp32_query_out.data(), fp32_doc_out.data(), fp32_qmeta_reformer.dimension(), &score_float32); - IndexQueryMeta qmeta_reformer; - - std::string query_out; - ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, - &qmeta_reformer)); - ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); - std::string doc_out; ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, &qmeta_reformer)); @@ -588,3 +587,714 @@ TEST(QuantizedIntegerMetric, TestInt4Cosine) { ASSERT_NEAR(score_scalar, score_sse, 0.001); } } + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 128; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = 
converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx512vnni = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + 
std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx512vnni(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx512vnni(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx512vnni[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx512vnni[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 128; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + 
meta.set_metric("InnerProduct", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); 
+ std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 128; + + auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, 
reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector 
doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 128; + + auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = 
turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector scores_float32(BATCH_SIZE, 0.0f); + std::vector scores_scalar(BATCH_SIZE, 0.0f); + std::vector scores_avx2(BATCH_SIZE, 0.0f); + std::vector scores_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector float_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + float_ptrs[k] = doc_vecs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(float_ptrs.data(), query_vec.data(), BATCH_SIZE, + DIMENSION, &scores_float32[0]); + + 
batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &scores_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(scores_float32[j], scores_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_float32[j], scores_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(scores_scalar[j], scores_avx2[j], 0.001); + ASSERT_NEAR(scores_scalar[j], scores_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt8CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 128; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int8 converter + auto converter = IndexFactory::CreateConverter("CosineInt8Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + 
turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx512vnni = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512VNNI); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + std::vector fp32_doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + 
fp32_doc_outs.push_back(fp32_doc_out); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector score_float32(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx512vnni(BATCH_SIZE, 0.0f); + std::vector score_avx2(BATCH_SIZE, 0.0f); + std::vector score_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector fp32_doc_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + fp32_doc_ptrs[k] = fp32_doc_outs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(fp32_doc_ptrs.data(), fp32_query_out.data(), + BATCH_SIZE, fp32_qmeta_reformer.dimension(), + &score_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_scalar[0]); + + batch_func_avx512vnni(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx512vnni[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(score_float32[j], score_avx512vnni[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar[j], score_avx2[j], 0.001); + ASSERT_NEAR(score_scalar[j], score_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + fp32_doc_outs.clear(); + } + } +} + +// Target Test Type: avx2, sse, scalar +TEST(QuantizedIntegerMetric, TestInt4CosineBatch) { + std::mt19937 
gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 128; + + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + + // fp32 converter + auto fp32_converter = IndexFactory::CreateConverter("CosineFp32Converter"); + ASSERT_TRUE(!!fp32_converter); + ASSERT_EQ(0u, fp32_converter->init(meta, Params())); + + auto &fp32_convert_meta = fp32_converter->meta(); + auto fp32_reformer = + IndexFactory::CreateReformer(fp32_convert_meta.reformer_name()); + ASSERT_EQ(0, fp32_reformer->init(fp32_convert_meta.reformer_params())); + + // int4 converter + auto converter = IndexFactory::CreateConverter("CosineInt4Converter"); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + ASSERT_EQ(0, reformer->init(convert_meta.reformer_params())); + + auto batch_func_float32 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAuto); + + auto batch_func_avx2 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX2); + + auto batch_func_sse = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kSSE); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kInt4, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta fp32_qmeta_reformer; + + 
std::string fp32_query_out; + ASSERT_EQ(0, fp32_reformer->transform(query_vec.data(), qmeta, + &fp32_query_out, &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + IndexQueryMeta qmeta_reformer; + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + std::vector fp32_doc_outs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string fp32_doc_out; + ASSERT_EQ(0, fp32_reformer->transform(doc_vec.data(), qmeta, &fp32_doc_out, + &fp32_qmeta_reformer)); + ASSERT_EQ(fp32_qmeta_reformer.dimension(), fp32_convert_meta.dimension()); + + fp32_doc_outs.push_back(fp32_doc_out); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_outs.size() == BATCH_SIZE) { + std::vector score_float32(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx2(BATCH_SIZE, 0.0f); + std::vector score_sse(BATCH_SIZE, 0.0f); + + // Build pointer arrays for batch functions + std::vector fp32_doc_ptrs(BATCH_SIZE); + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + fp32_doc_ptrs[k] = fp32_doc_outs[k].data(); + doc_ptrs[k] = doc_outs[k].data(); + } + + batch_func_float32(fp32_doc_ptrs.data(), fp32_query_out.data(), + BATCH_SIZE, fp32_qmeta_reformer.dimension(), + &score_float32[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_scalar[0]); + + batch_func_avx2(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + 
qmeta_reformer.dimension(), &score_avx2[0]); + + batch_func_sse(doc_ptrs.data(), query_out.data(), BATCH_SIZE, + qmeta_reformer.dimension(), &score_sse[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + ASSERT_NEAR(score_float32[j], score_avx2[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_sse[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_float32[j], score_scalar[j], 0.2 * DIMENSION); + ASSERT_NEAR(score_scalar[j], score_avx2[j], 0.001); + ASSERT_NEAR(score_scalar[j], score_sse[j], 0.001); + } + + doc_outs.clear(); + doc_vecs.clear(); + fp32_doc_outs.clear(); + } + } +} \ No newline at end of file From 36c4f4c04085d11141f072fb67f77e96bdd67f5f Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 14 Apr 2026 16:44:53 +0800 Subject: [PATCH 40/44] feat: add batch ut --- tests/turbo/turbo_cosine_test.cc | 193 ++++++++++++++++++++ tests/turbo/turbo_euclidean_test.cc | 166 +++++++++++++++++ tests/turbo/turbo_inner_product_test.cc | 167 +++++++++++++++++ tests/turbo/turbo_quantized_integer_test.cc | 12 +- 4 files changed, 532 insertions(+), 6 deletions(-) diff --git a/tests/turbo/turbo_cosine_test.cc b/tests/turbo/turbo_cosine_test.cc index a4f1d3072..ece33613d 100644 --- a/tests/turbo/turbo_cosine_test.cc +++ b/tests/turbo/turbo_cosine_test.cc @@ -171,3 +171,196 @@ TEST(CosineMetric, TestFp16Cosine) { ASSERT_NEAR(score_scalar, score_avx, epsilon); } } + +// Target Test Type: avx, avx512, scalar +TEST(CosineMetric, TestFp32CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("CosineFp32Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = 
IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), 
query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(CosineMetric, TestFp16CosineBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("CosineFp16Converter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("Cosine", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kCosine, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, 
reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git a/tests/turbo/turbo_euclidean_test.cc b/tests/turbo/turbo_euclidean_test.cc index c472b33ab..8388489f4 100644 --- a/tests/turbo/turbo_euclidean_test.cc +++ b/tests/turbo/turbo_euclidean_test.cc @@ -148,3 +148,169 @@ TEST(SquaredEuclideanMetric, 
TestFp16SquaredEuclidean) { ASSERT_NEAR(score_scalar, score_avx, epsilon); } } + +// Target Test Type: avx, avx512, scalar +TEST(SquaredEuclideanMetric, TestFp32SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + std::vector> doc_vecs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + doc_vecs.push_back(doc_vec); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], 
score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar +TEST(SquaredEuclideanMetric, TestFp16SquaredEuclideanBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto converter = IndexFactory::CreateConverter("HalfFloatConverter"); + IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(!!converter); + ASSERT_EQ(0u, converter->init(meta, Params())); + auto &convert_meta = converter->meta(); + auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name()); + + auto batch_func_avx512fp16 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16); + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kFp16, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + IndexQueryMeta qmeta; + qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION); + IndexQueryMeta qmeta_reformer; + + std::string query_out; + ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + 
std::vector> doc_vecs; + std::vector doc_outs; + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + doc_vecs.push_back(doc_vec); + + std::string doc_out; + ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out, + &qmeta_reformer)); + ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension()); + + doc_outs.push_back(doc_out); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_outs[k].data(); + } + + std::vector score_avx512fp16(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_scalar(BATCH_SIZE, 0.0f); + + batch_func_avx512fp16(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512fp16[0]); + + batch_func_avx512(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_avx512[0]); + + batch_func_avx(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]); + + batch_func_scalar(doc_ptrs.data(), query_out.data(), + qmeta_reformer.dimension(), BATCH_SIZE, + &score_scalar[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.2; + ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + doc_outs.clear(); + } + } +} diff --git a/tests/turbo/turbo_inner_product_test.cc b/tests/turbo/turbo_inner_product_test.cc index 8aaa1f422..14fc2cfc0 100644 --- a/tests/turbo/turbo_inner_product_test.cc +++ b/tests/turbo/turbo_inner_product_test.cc @@ -148,3 +148,170 @@ TEST(InnerProductMetric, TestFp16InnerProduct) { ASSERT_NEAR(score_scalar, score_avx, epsilon); } } + +// Target Test Type: avx, avx512, scalar +TEST(InnerProductMetric, 
TestFp32InnerProductBatch) { + std::mt19937 gen(15583); + std::uniform_real_distribution dist(-1.0, 2.0); + + const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); + const size_t COUNT = 1024; + const size_t BATCH_SIZE = 16; + + auto batch_func_avx512 = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512); + + auto batch_func_avx = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX); + + auto batch_func_scalar = turbo::get_batch_distance_func( + turbo::MetricType::kInnerProduct, turbo::DataType::kFp32, + turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar); + + ailego::NumericalVector query_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + query_vec[j] = dist(gen); + } + + std::vector> doc_vecs; + + for (size_t i = 0; i < COUNT; ++i) { + ailego::NumericalVector doc_vec(DIMENSION); + for (size_t j = 0; j < DIMENSION; ++j) { + doc_vec[j] = dist(gen); + } + + doc_vecs.push_back(doc_vec); + + if (doc_vecs.size() == BATCH_SIZE) { + std::vector doc_ptrs(BATCH_SIZE); + for (size_t k = 0; k < BATCH_SIZE; ++k) { + doc_ptrs[k] = doc_vecs[k].data(); + } + + std::vector score_scalar(BATCH_SIZE, 0.0f); + std::vector score_avx(BATCH_SIZE, 0.0f); + std::vector score_avx512(BATCH_SIZE, 0.0f); + + batch_func_scalar(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_scalar[0]); + batch_func_avx512(doc_ptrs.data(), query_vec.data(), DIMENSION, + BATCH_SIZE, &score_avx512[0]); + batch_func_avx(doc_ptrs.data(), query_vec.data(), DIMENSION, BATCH_SIZE, + &score_avx[0]); + + for (size_t j = 0; j < BATCH_SIZE; ++j) { + float epsilon = 0.001; + ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon); + ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon); + } + + doc_vecs.clear(); + } + } +} + +// Target Test Type: avx, avx512, avx512fp16, scalar 
+TEST(InnerProductMetric, TestFp16InnerProductBatch) {
+  std::mt19937 gen(15583);
+  std::uniform_real_distribution<float> dist(-1.0, 2.0);
+
+  const size_t DIMENSION = std::uniform_int_distribution<size_t>(1, 128)(gen);
+  const size_t COUNT = 1024;
+  const size_t BATCH_SIZE = 16;
+
+  auto converter = IndexFactory::CreateConverter("HalfFloatConverter");
+  IndexMeta meta(IndexMeta::DT_FP32, DIMENSION);
+  meta.set_metric("InnerProduct", 0, Params());
+  ASSERT_TRUE(!!converter);
+  ASSERT_EQ(0u, converter->init(meta, Params()));
+  auto &convert_meta = converter->meta();
+  auto reformer = IndexFactory::CreateReformer(convert_meta.reformer_name());
+
+  auto batch_func_avx512fp16 = turbo::get_batch_distance_func(
+      turbo::MetricType::kInnerProduct, turbo::DataType::kFp16,
+      turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512FP16);
+
+  auto batch_func_avx512 = turbo::get_batch_distance_func(
+      turbo::MetricType::kInnerProduct, turbo::DataType::kFp16,
+      turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX512);
+
+  auto batch_func_avx = turbo::get_batch_distance_func(
+      turbo::MetricType::kInnerProduct, turbo::DataType::kFp16,
+      turbo::QuantizeType::kDefault, turbo::CpuArchType::kAVX);
+
+  auto batch_func_scalar = turbo::get_batch_distance_func(
+      turbo::MetricType::kInnerProduct, turbo::DataType::kFp16,
+      turbo::QuantizeType::kDefault, turbo::CpuArchType::kScalar);
+
+  ailego::NumericalVector<float> query_vec(DIMENSION);
+  for (size_t j = 0; j < DIMENSION; ++j) {
+    query_vec[j] = dist(gen);
+  }
+
+  IndexQueryMeta qmeta;
+  qmeta.set_meta(IndexMeta::DT_FP32, DIMENSION);
+  IndexQueryMeta qmeta_reformer;
+
+  std::string query_out;
+  ASSERT_EQ(0, reformer->transform(query_vec.data(), qmeta, &query_out,
+                                   &qmeta_reformer));
+  ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension());
+
+  std::vector<ailego::NumericalVector<float>> doc_vecs;
+  std::vector<std::string> doc_outs;
+
+  for (size_t i = 0; i < COUNT; ++i) {
+    ailego::NumericalVector<float> doc_vec(DIMENSION);
+    for (size_t j = 0; j < DIMENSION; ++j) {
+      doc_vec[j] = dist(gen);
+    }
+
+    doc_vecs.push_back(doc_vec);
+
+    std::string doc_out;
+    ASSERT_EQ(0, reformer->transform(doc_vec.data(), qmeta, &doc_out,
+                                     &qmeta_reformer));
+    ASSERT_EQ(qmeta_reformer.dimension(), convert_meta.dimension());
+    doc_outs.push_back(doc_out);
+
+    if (doc_vecs.size() == BATCH_SIZE) {
+      std::vector<const void *> doc_ptrs(BATCH_SIZE);
+      for (size_t k = 0; k < BATCH_SIZE; ++k) {
+        doc_ptrs[k] = doc_outs[k].data();
+      }
+
+      std::vector<float> score_avx512fp16(BATCH_SIZE, 0.0f);
+      std::vector<float> score_avx512(BATCH_SIZE, 0.0f);
+      std::vector<float> score_avx(BATCH_SIZE, 0.0f);
+      std::vector<float> score_scalar(BATCH_SIZE, 0.0f);
+
+      batch_func_avx512fp16(doc_ptrs.data(), query_out.data(),
+                            qmeta_reformer.dimension(), BATCH_SIZE,
+                            &score_avx512fp16[0]);
+
+      batch_func_avx512(doc_ptrs.data(), query_out.data(),
+                        qmeta_reformer.dimension(), BATCH_SIZE,
+                        &score_avx512[0]);
+
+      batch_func_avx(doc_ptrs.data(), query_out.data(),
+                     qmeta_reformer.dimension(), BATCH_SIZE, &score_avx[0]);
+
+      batch_func_scalar(doc_ptrs.data(), query_out.data(),
+                        qmeta_reformer.dimension(), BATCH_SIZE,
+                        &score_scalar[0]);
+
+      for (size_t j = 0; j < BATCH_SIZE; ++j) {
+        float epsilon = 0.2;
+        ASSERT_NEAR(score_scalar[j], score_avx512fp16[j], epsilon);
+        ASSERT_NEAR(score_scalar[j], score_avx512[j], epsilon);
+        ASSERT_NEAR(score_scalar[j], score_avx[j], epsilon);
+      }
+
+      doc_vecs.clear();
+      doc_outs.clear();
+    }
+  }
+}
diff --git a/tests/turbo/turbo_quantized_integer_test.cc b/tests/turbo/turbo_quantized_integer_test.cc
index a31dbcbd4..3394a27a0 100644
--- a/tests/turbo/turbo_quantized_integer_test.cc
+++ b/tests/turbo/turbo_quantized_integer_test.cc
@@ -595,7 +595,7 @@ TEST(QuantizedIntegerMetric, TestInt8InnerProductBatch) {
   const size_t DIMENSION = std::uniform_int_distribution<size_t>(1, 128)(gen);
   const size_t COUNT = 1024;
-  const size_t BATCH_SIZE = 128;
+  const size_t BATCH_SIZE = 16;
 
   auto converter = IndexFactory::CreateConverter("Int8StreamingConverter");
   IndexMeta meta(IndexMeta::DT_FP32, DIMENSION);
@@ -710,7 +710,7 @@
TEST(QuantizedIntegerMetric, TestInt4InnerProductBatch) { const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; const size_t COUNT = 1024; - const size_t BATCH_SIZE = 128; + const size_t BATCH_SIZE = 16; auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -816,7 +816,7 @@ TEST(QuantizedIntegerMetric, TestInt8SquaredEuclideanBatch) { const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); const size_t COUNT = 1024; - const size_t BATCH_SIZE = 128; + const size_t BATCH_SIZE = 16; auto converter = IndexFactory::CreateConverter("Int8StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -922,7 +922,7 @@ TEST(QuantizedIntegerMetric, TestInt4SquaredEuclideanBatch) { const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; const size_t COUNT = 1024; - const size_t BATCH_SIZE = 128; + const size_t BATCH_SIZE = 16; auto converter = IndexFactory::CreateConverter("Int4StreamingConverter"); IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); @@ -1028,7 +1028,7 @@ TEST(QuantizedIntegerMetric, TestInt8CosineBatch) { const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen); const size_t COUNT = 1024; - const size_t BATCH_SIZE = 128; + const size_t BATCH_SIZE = 16; IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); meta.set_metric("Cosine", 0, Params()); @@ -1172,7 +1172,7 @@ TEST(QuantizedIntegerMetric, TestInt4CosineBatch) { const size_t DIMENSION = std::uniform_int_distribution(1, 128)(gen) * 2; const size_t COUNT = 1024; - const size_t BATCH_SIZE = 128; + const size_t BATCH_SIZE = 16; IndexMeta meta(IndexMeta::DT_FP32, DIMENSION); meta.set_metric("Cosine", 0, Params()); From 895cd78910f90e492ad53637f7809b4a354df43e Mon Sep 17 00:00:00 2001 From: ray Date: Tue, 14 Apr 2026 20:03:30 +0800 Subject: [PATCH 41/44] feat: add batch dist --- src/turbo/armv8/float32/cosine.cc | 10 +++ src/turbo/armv8/float32/inner_product.cc | 4 + 
.../armv8/float32/inner_product_common.h | 75 ++++++++++++++++++ src/turbo/armv8/float32/squared_euclidean.cc | 3 +- .../armv8/float32/squared_euclidean_common.h | 76 +++++++++++++++++++ src/turbo/armv8/half_float/inner_product.cc | 4 + .../armv8/half_float/squared_euclidean.cc | 1 + src/turbo/avx/float32/cosine.cc | 10 +++ src/turbo/avx/float32/inner_product.cc | 6 +- src/turbo/avx/float32/squared_euclidean.cc | 5 +- src/turbo/avx/half_float/cosine.cc | 2 +- src/turbo/avx/half_float/inner_product.cc | 4 + src/turbo/avx/half_float/squared_euclidean.cc | 1 + .../record_quantized_int4/inner_product.cc | 2 +- src/turbo/avx512/float32/cosine.cc | 2 +- src/turbo/avx512/float32/squared_euclidean.cc | 1 + src/turbo/avx512/half_float/inner_product.cc | 4 + .../avx512/half_float/squared_euclidean.cc | 1 + src/turbo/avx512_fp16/half_float/cosine.cc | 2 +- .../avx512_fp16/half_float/inner_product.cc | 4 + .../half_float/squared_euclidean.cc | 5 +- .../record_quantized_int8/inner_product.cc | 4 + src/turbo/scalar/float32/cosine.cc | 7 +- src/turbo/scalar/float32/inner_product.cc | 6 +- src/turbo/scalar/float32/squared_euclidean.cc | 6 +- src/turbo/scalar/half_float/cosine.cc | 6 +- src/turbo/scalar/half_float/inner_product.cc | 6 +- .../scalar/half_float/squared_euclidean.cc | 6 +- .../scalar/record_quantized_int4/cosine.cc | 8 +- .../record_quantized_int4/inner_product.cc | 8 +- .../squared_euclidean.cc | 8 +- .../scalar/record_quantized_int8/cosine.cc | 8 +- .../record_quantized_int8/inner_product.cc | 8 +- .../squared_euclidean.cc | 8 +- 34 files changed, 265 insertions(+), 46 deletions(-) diff --git a/src/turbo/armv8/float32/cosine.cc b/src/turbo/armv8/float32/cosine.cc index 49f191103..7e2b990d7 100644 --- a/src/turbo/armv8/float32/cosine.cc +++ b/src/turbo/armv8/float32/cosine.cc @@ -39,7 +39,17 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float 
*distances) {
 #if defined(__ARM_NEON)
+  const int original_dim = dim - 1;
+  if (original_dim <= 0) {
+    return;
+  }
+  internal::inner_product_fp32_batch_armv8(vectors, query, n, original_dim,
+                                           distances);
+
+  for (int i = 0; i < n; ++i) {
+    distances[i] = 1 - distances[i];
+  }
 #else
   (void)vectors;
   (void)query;
diff --git a/src/turbo/armv8/float32/inner_product.cc b/src/turbo/armv8/float32/inner_product.cc
index dbc5a3048..7cfbd7784 100644
--- a/src/turbo/armv8/float32/inner_product.cc
+++ b/src/turbo/armv8/float32/inner_product.cc
@@ -38,11 +38,15 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim,
 void inner_product_fp32_batch_distance(const void *const *vectors,
                                        const void *query, size_t n, size_t dim,
                                        float *distances) {
+#if defined(__ARM_NEON)
+  inner_product_fp32_batch_armv8(vectors, query, n, dim, distances);
+#else
   (void)vectors;
   (void)query;
   (void)n;
   (void)dim;
   (void)distances;
+#endif // __ARM_NEON
 }
 
 }  // namespace zvec::turbo::armv8
\ No newline at end of file
diff --git a/src/turbo/armv8/float32/inner_product_common.h b/src/turbo/armv8/float32/inner_product_common.h
index fe75269ed..26ad45d21 100644
--- a/src/turbo/armv8/float32/inner_product_common.h
+++ b/src/turbo/armv8/float32/inner_product_common.h
@@ -62,6 +62,81 @@ static __attribute__((always_inline)) void inner_product_fp32_armv8(
   *distance = -result;
 }
 
+// Computes -(dot(query, vectors[i])) for a compile-time batch of vectors.
+// NOTE(review): prefetch_ptrs is accepted but never used in this body —
+// presumably intended for __builtin_prefetch; confirm against the dispatcher.
+template <size_t batch_size>
+static __attribute__((always_inline)) void inner_product_fp32_batch_armv8_impl(
+    const void *query, const void *const *vectors,
+    const std::array<const void *, batch_size> &prefetch_ptrs,
+    size_t dimensionality, float *distances) {
+  float32x4_t v_sum[batch_size];
+  for (size_t i = 0; i < batch_size; ++i) {
+    v_sum[i] = vdupq_n_f32(0);
+  }
+
+  size_t dim = 0;
+  for (; dim + 64 <= dimensionality; dim += 4) {
+    for (size_t i = 0; i < batch_size; ++i) {
+      v_sum[i] = vfmaq_f32(
+          v_sum[i], vld1q_f32(reinterpret_cast<const float *>(query) + dim),
+          vld1q_f32(reinterpret_cast<const float *>(vectors[i]) + dim));
+    }
+  }
+
+  while (dim + 4 <= dimensionality) {
+    for (size_t i = 0; i < batch_size; ++i) {
+      v_sum[i] = vfmaq_f32(
+          v_sum[i], vld1q_f32(reinterpret_cast<const float *>(query) + dim),
+          vld1q_f32(reinterpret_cast<const float *>(vectors[i]) + dim));
+    }
+
+    dim += 4;
+  }
+
+  for (size_t i = 0; i < batch_size; ++i) {
+    float result = vaddvq_f32(v_sum[i]);
+    switch (dimensionality - dim) {
+      case 3:
+        FMA_FP32_GENERAL(reinterpret_cast<const float *>(query)[dim + 2],
+                         reinterpret_cast<const float *>(vectors[i])[dim + 2],
+                         result)
+        /* FALLTHRU */
+      case 2:
+        FMA_FP32_GENERAL(reinterpret_cast<const float *>(query)[dim + 1],
+                         reinterpret_cast<const float *>(vectors[i])[dim + 1],
+                         result)
+        /* FALLTHRU */
+      case 1:
+        FMA_FP32_GENERAL(reinterpret_cast<const float *>(query)[dim + 0],
+                         reinterpret_cast<const float *>(vectors[i])[dim + 0],
+                         result)
+    }
+
+    distances[i] = -result;
+  }
+}
+
+// Dispatch batched inner product over all `n` vectors with prefetching.
+static __attribute__((always_inline)) void inner_product_fp32_batch_armv8(
+    const void *const *vectors, const void *query, size_t n, size_t dim,
+    float *distances) {
+  static constexpr size_t batch_size = 2;
+  static constexpr size_t prefetch_step = 2;
+  size_t i = 0;
+  for (; i + batch_size <= n; i += batch_size) {
+    std::array<const void *, batch_size> prefetch_ptrs;
+    for (size_t j = 0; j < batch_size; ++j) {
+      if (i + j + batch_size * prefetch_step < n) {
+        prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step];
+      } else {
+        prefetch_ptrs[j] = nullptr;
+      }
+    }
+    inner_product_fp32_batch_armv8_impl<batch_size>(
+        query, &vectors[i], prefetch_ptrs, dim, distances + i);
+  }
+  for (; i < n; i++) {
+    std::array<const void *, 1> prefetch_ptrs{nullptr};
+    inner_product_fp32_batch_armv8_impl<1>(query, &vectors[i], prefetch_ptrs,
+                                           dim, distances + i);
+  }
+}
+
 }  // namespace zvec::turbo::armv8::internal
 
 #endif  // defined(__ARM_NEON)
diff --git a/src/turbo/armv8/float32/squared_euclidean.cc b/src/turbo/armv8/float32/squared_euclidean.cc
index a2803d9ae..b39fdac2e 100644
--- a/src/turbo/armv8/float32/squared_euclidean.cc
+++ b/src/turbo/armv8/float32/squared_euclidean.cc
@@ -41,13 +41,14 @@ void squared_euclidean_fp32_batch_distance(const void *const
*vectors,
                                            const void *query, size_t n,
                                            size_t dim, float *distances) {
 #if defined(__ARM_NEON)
+  squared_euclidean_fp32_batch_armv8(vectors, query, n, dim, distances);
 #else
   (void)vectors;
   (void)query;
   (void)n;
   (void)dim;
   (void)distances;
-#endif //__ARM_NEON
+#endif // __ARM_NEON
 }
 
 }  // namespace zvec::turbo::armv8
\ No newline at end of file
diff --git a/src/turbo/armv8/float32/squared_euclidean_common.h b/src/turbo/armv8/float32/squared_euclidean_common.h
index a1dd4643d..4f3419c56 100644
--- a/src/turbo/armv8/float32/squared_euclidean_common.h
+++ b/src/turbo/armv8/float32/squared_euclidean_common.h
@@ -69,6 +69,82 @@ static __attribute__((always_inline)) void squared_euclidean_fp32_armv8(
   *distance = result;
 }
 
+// Batched distance kernel over a compile-time batch of vectors.
+// NOTE(review): as written this accumulates a dot product and negates it
+// (apparent copy of the inner-product kernel); squared Euclidean should
+// accumulate (query[d] - vec[d])^2 and yield a non-negative sum — confirm.
+template <size_t batch_size>
+static __attribute__((always_inline)) void
+squared_euclidean_fp32_batch_armv8_impl(
+    const void *query, const void *const *vectors,
+    const std::array<const void *, batch_size> &prefetch_ptrs,
+    size_t dimensionality, float *distances) {
+  float32x4_t v_sum[batch_size];
+  for (size_t i = 0; i < batch_size; ++i) {
+    v_sum[i] = vdupq_n_f32(0);
+  }
+
+  size_t dim = 0;
+  for (; dim + 64 <= dimensionality; dim += 4) {
+    for (size_t i = 0; i < batch_size; ++i) {
+      v_sum[i] = vfmaq_f32(
+          v_sum[i], vld1q_f32(reinterpret_cast<const float *>(query) + dim),
+          vld1q_f32(reinterpret_cast<const float *>(vectors[i]) + dim));
+    }
+  }
+
+  while (dim + 4 <= dimensionality) {
+    for (size_t i = 0; i < batch_size; ++i) {
+      v_sum[i] = vfmaq_f32(
+          v_sum[i], vld1q_f32(reinterpret_cast<const float *>(query) + dim),
+          vld1q_f32(reinterpret_cast<const float *>(vectors[i]) + dim));
+    }
+
+    dim += 4;
+  }
+
+  for (size_t i = 0; i < batch_size; ++i) {
+    float result = vaddvq_f32(v_sum[i]);
+    switch (dimensionality - dim) {
+      case 3:
+        FMA_FP32_GENERAL(reinterpret_cast<const float *>(query)[dim + 2],
+                         reinterpret_cast<const float *>(vectors[i])[dim + 2],
+                         result)
+        /* FALLTHRU */
+      case 2:
+        FMA_FP32_GENERAL(reinterpret_cast<const float *>(query)[dim + 1],
+                         reinterpret_cast<const float *>(vectors[i])[dim + 1],
+                         result)
+        /* FALLTHRU */
+      case 1:
+        FMA_FP32_GENERAL(reinterpret_cast<const float *>(query)[dim + 0],
+                         reinterpret_cast<const float *>(vectors[i])[dim + 0],
+                         result)
+    }
+
+    distances[i] = -result;
+  }
+}
+
+// Dispatch batched squared Euclidean over all `n` vectors with prefetching.
+static __attribute__((always_inline)) void squared_euclidean_fp32_batch_armv8(
+    const void *const *vectors, const void *query, size_t n, size_t dim,
+    float *distances) {
+  static constexpr size_t batch_size = 2;
+  static constexpr size_t prefetch_step = 2;
+  size_t i = 0;
+  for (; i + batch_size <= n; i += batch_size) {
+    std::array<const void *, batch_size> prefetch_ptrs;
+    for (size_t j = 0; j < batch_size; ++j) {
+      if (i + j + batch_size * prefetch_step < n) {
+        prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step];
+      } else {
+        prefetch_ptrs[j] = nullptr;
+      }
+    }
+    squared_euclidean_fp32_batch_armv8_impl<batch_size>(
+        query, &vectors[i], prefetch_ptrs, dim, distances + i);
+  }
+  for (; i < n; i++) {
+    std::array<const void *, 1> prefetch_ptrs{nullptr};
+    squared_euclidean_fp32_batch_armv8_impl<1>(
+        query, &vectors[i], prefetch_ptrs, dim, distances + i);
+  }
+}
+
 }  // namespace zvec::turbo::armv8::internal
 
 #endif  // defined(__ARM_NEON)
diff --git a/src/turbo/armv8/half_float/inner_product.cc b/src/turbo/armv8/half_float/inner_product.cc
index 03831a986..7e0dcc448 100644
--- a/src/turbo/armv8/half_float/inner_product.cc
+++ b/src/turbo/armv8/half_float/inner_product.cc
@@ -44,11 +44,15 @@ void inner_product_fp16_distance(const void *a, const void *b, size_t dim,
 void inner_product_fp16_batch_distance(const void *const *vectors,
                                        const void *query, size_t n, size_t dim,
                                        float *distances) {
+#if defined(__ARM_NEON)
+  inner_product_fp16_batch_armv8(vectors, query, n, dim, distances);
+#else
   (void)vectors;
   (void)query;
   (void)n;
   (void)dim;
   (void)distances;
+#endif //__ARM_NEON
 }
 
 }  // namespace zvec::turbo::armv8
diff --git a/src/turbo/armv8/half_float/squared_euclidean.cc b/src/turbo/armv8/half_float/squared_euclidean.cc
index 8f197cad9..5f6ac829b 100644
--- a/src/turbo/armv8/half_float/squared_euclidean.cc
+++ b/src/turbo/armv8/half_float/squared_euclidean.cc
@@ -46,6 +46,7 @@ void
squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__ARM_NEON) + squared_euclidean_fp16_batch_armv8(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc index 42e858df3..488fadc20 100644 --- a/src/turbo/avx/float32/cosine.cc +++ b/src/turbo/avx/float32/cosine.cc @@ -43,7 +43,17 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) + const int original_dim = dim - 1; + if (original_dim <= 0) { + return; + } + internal::inner_product_fp32_batch_avx(vectors, query, n, original_dim, + distances); + + for (int i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx/float32/inner_product.cc b/src/turbo/avx/float32/inner_product.cc index 94ed2b0cd..10b30eee3 100644 --- a/src/turbo/avx/float32/inner_product.cc +++ b/src/turbo/avx/float32/inner_product.cc @@ -106,11 +106,15 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim, void inner_product_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + inner_product_fp32_batch_avx(vectors, query, n, dim, distances); +#else (void)vectors; + (void)distances; (void)query; (void)n; (void)dim; - (void)distances; +#endif // __AVX__ } } // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc index a74856b60..19e81abb0 100644 --- a/src/turbo/avx/float32/squared_euclidean.cc +++ b/src/turbo/avx/float32/squared_euclidean.cc @@ -106,13 +106,14 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, 
size_t n, size_t dim, float *distances) { #if defined(__AVX__) + squared_euclidean_fp32_batch_avx(vectors, query, n, dim, distances); #else (void)vectors; + (void)distances; (void)query; (void)n; (void)dim; - (void)distances; -#endif //__AVX__ +#endif // __AVX__ } } // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc index 3500907ac..af68a7d8a 100644 --- a/src/turbo/avx/half_float/cosine.cc +++ b/src/turbo/avx/half_float/cosine.cc @@ -43,7 +43,7 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) - + cosine_fp16_batch_avx(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc index 9ef2fadd5..44a72dbaa 100644 --- a/src/turbo/avx/half_float/inner_product.cc +++ b/src/turbo/avx/half_float/inner_product.cc @@ -42,11 +42,15 @@ void inner_product_fp16_distance(const void *a, const void *b, size_t dim, void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { +#if defined(__AVX__) + inner_product_fp16_batch_avx(vectors, query, n, dim, distances); +#else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; +#endif // __AVX__ } } // namespace zvec::turbo::avx \ No newline at end of file diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc index 4b7c700b2..222ec1176 100644 --- a/src/turbo/avx/half_float/squared_euclidean.cc +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -40,6 +40,7 @@ void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) + 
squared_euclidean_fp16_batch_avx(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc index 5d98e995c..4db9e7e61 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product.cc +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -63,7 +63,7 @@ void inner_product_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX2__) - + inner_product_int4_batch_avx2(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/float32/cosine.cc b/src/turbo/avx512/float32/cosine.cc index 78ee5e4a7..55c48c7bf 100644 --- a/src/turbo/avx512/float32/cosine.cc +++ b/src/turbo/avx512/float32/cosine.cc @@ -43,7 +43,7 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) - + cosine_fp32_batch_avx512(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/float32/squared_euclidean.cc b/src/turbo/avx512/float32/squared_euclidean.cc index 8f492e0fb..03e0120d6 100644 --- a/src/turbo/avx512/float32/squared_euclidean.cc +++ b/src/turbo/avx512/float32/squared_euclidean.cc @@ -90,6 +90,7 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) + squared_euclidean_fp32_batch_avx512(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc index 74611de3a..058b522a9 100644 --- a/src/turbo/avx512/half_float/inner_product.cc +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -43,11 +43,15 @@ void 
inner_product_fp16_distance(const void *a, const void *b, size_t dim, void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { +#if defined(__AVX512F__) + inner_product_fp16_batch_avx512(vectors, query, n, dim, distances); +#else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; +#endif } } // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc index 8fceea89a..0569b4d6c 100644 --- a/src/turbo/avx512/half_float/squared_euclidean.cc +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -46,6 +46,7 @@ void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) + squared_euclidean_fp16_batch_avx512(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx512_fp16/half_float/cosine.cc b/src/turbo/avx512_fp16/half_float/cosine.cc index 863d3ead8..ab9f88171 100644 --- a/src/turbo/avx512_fp16/half_float/cosine.cc +++ b/src/turbo/avx512_fp16/half_float/cosine.cc @@ -43,7 +43,7 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512FP16__) - + cosine_fp16_batch_avx512(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx512_fp16/half_float/inner_product.cc b/src/turbo/avx512_fp16/half_float/inner_product.cc index 3feccaab7..cba33b9a4 100644 --- a/src/turbo/avx512_fp16/half_float/inner_product.cc +++ b/src/turbo/avx512_fp16/half_float/inner_product.cc @@ -96,11 +96,15 @@ void inner_product_fp16_distance(const void *a, const void *b, size_t dim, void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, 
size_t dim, float *distances) { +#if defined(__AVX512FP16__) + inner_product_fp16_batch_avx512fp16(vectors, query, n, dim, distances); +#else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; +#endif // __AVX512FP16__ } } // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc index d3fb56587..7e6962892 100644 --- a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc @@ -92,20 +92,21 @@ void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, (void)b; (void)dim; (void)distance; -#endif // __AVX512F__ +#endif // __AVX512FP16__ } void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512FP16__) + squared_euclidean_fp32_batch_avx512fp16(vectors, query, n, dim, distances); #else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; -#endif //__AVX512F__ +#endif //__AVX512FP16__ } } // namespace zvec::turbo::avx512_fp16 \ No newline at end of file diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc index 09feca80b..e176ce7f2 100644 --- a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc @@ -51,11 +51,15 @@ void inner_product_int8_distance(const void *a, const void *b, size_t dim, void inner_product_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { +#if defined(__AVX512VNNI__) + inner_product_int8_batch_avx512_vnni(vectors, query, n, dim, distances); +#else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; +#endif // __AVX512VNNI__ } } // namespace zvec::turbo::avx512_vnni \ No newline at end of file diff 
--git a/src/turbo/scalar/float32/cosine.cc b/src/turbo/scalar/float32/cosine.cc index 21c7938d7..cffb0b166 100644 --- a/src/turbo/scalar/float32/cosine.cc +++ b/src/turbo/scalar/float32/cosine.cc @@ -29,6 +29,11 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, } void cosine_fp32_batch_distance(const void *const *vectors, const void *query, - size_t n, size_t dim, float *distances) {} + size_t n, size_t dim, float *distances) { + inner_product_fp32_batch_distance(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; i++) { + distances[i] = 1 - distances[i]; + } +} } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/inner_product.cc b/src/turbo/scalar/float32/inner_product.cc index 65f63bb36..23a282ef3 100644 --- a/src/turbo/scalar/float32/inner_product.cc +++ b/src/turbo/scalar/float32/inner_product.cc @@ -34,6 +34,10 @@ void inner_product_fp32_distance(const void *a, const void *b, size_t dim, // Batch version of inner_product_fp32_distance. 
void inner_product_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) {} + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_fp32_distance(vectors[i], query, dim, &distances[i]); + } +} } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/float32/squared_euclidean.cc b/src/turbo/scalar/float32/squared_euclidean.cc index f69c42e4d..a3ffd10bb 100644 --- a/src/turbo/scalar/float32/squared_euclidean.cc +++ b/src/turbo/scalar/float32/squared_euclidean.cc @@ -32,6 +32,10 @@ void squared_euclidean_fp32_distance(const void *a, const void *b, size_t dim, void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, - size_t dim, float *distances) {} + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } +} } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/cosine.cc b/src/turbo/scalar/half_float/cosine.cc index 7c46eb0f5..3c7a39550 100644 --- a/src/turbo/scalar/half_float/cosine.cc +++ b/src/turbo/scalar/half_float/cosine.cc @@ -29,6 +29,10 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, } void cosine_fp16_batch_distance(const void *const *vectors, const void *query, - size_t n, size_t dim, float *distances) {} + size_t n, size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + cosine_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/inner_product.cc b/src/turbo/scalar/half_float/inner_product.cc index 93cb41ec1..d06c45b25 100644 --- a/src/turbo/scalar/half_float/inner_product.cc +++ b/src/turbo/scalar/half_float/inner_product.cc @@ -37,6 +37,10 @@ void inner_product_fp16_distance(const void *a, const void *b, size_t dim, // 
Batch version of inner_product_fp16_distance. void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, - float *distances) {} + float *distances) { + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/half_float/squared_euclidean.cc b/src/turbo/scalar/half_float/squared_euclidean.cc index 0967ee01a..c3f6b3c2e 100644 --- a/src/turbo/scalar/half_float/squared_euclidean.cc +++ b/src/turbo/scalar/half_float/squared_euclidean.cc @@ -34,6 +34,10 @@ void squared_euclidean_fp16_distance(const void *a, const void *b, size_t dim, void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, - size_t dim, float *distances) {} + size_t dim, float *distances) { + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } +} } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/cosine.cc b/src/turbo/scalar/record_quantized_int4/cosine.cc index b4c516fde..cab09202d 100644 --- a/src/turbo/scalar/record_quantized_int4/cosine.cc +++ b/src/turbo/scalar/record_quantized_int4/cosine.cc @@ -47,11 +47,9 @@ void cosine_int4_distance(const void *a, const void *b, size_t dim, void cosine_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; + for (size_t i = 0; i < n; ++i) { + cosine_int4_distance(vectors[i], query, dim, &distances[i]); + } } } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/inner_product.cc b/src/turbo/scalar/record_quantized_int4/inner_product.cc index 406b68976..02bdec849 100644 --- 
a/src/turbo/scalar/record_quantized_int4/inner_product.cc +++ b/src/turbo/scalar/record_quantized_int4/inner_product.cc @@ -51,11 +51,9 @@ void inner_product_int4_distance(const void *a, const void *b, size_t dim, void inner_product_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; + for (size_t i = 0; i < n; ++i) { + inner_product_int4_distance(vectors[i], query, dim, &distances[i]); + } } } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc index 0feb7eae1..555f96246 100644 --- a/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc +++ b/src/turbo/scalar/record_quantized_int4/squared_euclidean.cc @@ -53,11 +53,9 @@ void squared_euclidean_int4_distance(const void *a, const void *b, size_t dim, void squared_euclidean_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; + for (size_t i = 0; i < n; ++i) { + squared_euclidean_int4_distance(vectors[i], query, dim, &distances[i]); + } } } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/cosine.cc b/src/turbo/scalar/record_quantized_int8/cosine.cc index a18403f3e..fe5faf8e7 100644 --- a/src/turbo/scalar/record_quantized_int8/cosine.cc +++ b/src/turbo/scalar/record_quantized_int8/cosine.cc @@ -48,11 +48,9 @@ void cosine_int8_distance(const void *a, const void *b, size_t dim, void cosine_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; + for (size_t i = 0; i < n; ++i) { + cosine_int8_distance(vectors[i], query, dim, 
&distances[i]); + } } } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/inner_product.cc b/src/turbo/scalar/record_quantized_int8/inner_product.cc index 115ab2992..e33cdac12 100644 --- a/src/turbo/scalar/record_quantized_int8/inner_product.cc +++ b/src/turbo/scalar/record_quantized_int8/inner_product.cc @@ -53,11 +53,9 @@ void inner_product_int8_distance(const void *a, const void *b, size_t dim, void inner_product_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; + for (size_t i = 0; i < n; ++i) { + inner_product_int8_distance(vectors[i], query, dim, &distances[i]); + } } } // namespace zvec::turbo::scalar \ No newline at end of file diff --git a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc index 4da173c33..d05d1a049 100644 --- a/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc +++ b/src/turbo/scalar/record_quantized_int8/squared_euclidean.cc @@ -53,11 +53,9 @@ void squared_euclidean_int8_distance(const void *a, const void *b, size_t dim, void squared_euclidean_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { - (void)vectors; - (void)query; - (void)n; - (void)dim; - (void)distances; + for (size_t i = 0; i < n; ++i) { + squared_euclidean_int8_distance(vectors[i], query, dim, &distances[i]); + } } } // namespace zvec::turbo::scalar \ No newline at end of file From 41efb292648c2482f26fde9a17fc42332531fd06 Mon Sep 17 00:00:00 2001 From: ray Date: Wed, 15 Apr 2026 13:54:27 +0800 Subject: [PATCH 42/44] fix: fix batch dist --- src/turbo/armv8/half_float/cosine.cc | 10 ++ .../armv8/half_float/inner_product_common.h | 82 ++++++++++- .../half_float/squared_euclidean_common.h | 92 +++++++++++-- src/turbo/avx/float32/common.h | 128 
++++++++++++++++++ src/turbo/avx/float32/cosine.cc | 6 +- src/turbo/avx/float32/squared_euclidean.cc | 4 +- src/turbo/avx/half_float/cosine.cc | 13 +- src/turbo/avx/half_float/inner_product.cc | 4 +- src/turbo/avx/half_float/squared_euclidean.cc | 4 +- .../record_quantized_int4/inner_product.cc | 2 +- src/turbo/avx512/float32/cosine.cc | 13 +- src/turbo/avx512/float32/inner_product.cc | 6 +- src/turbo/avx512/float32/squared_euclidean.cc | 4 +- src/turbo/avx512/half_float/cosine.cc | 10 ++ src/turbo/avx512/half_float/inner_product.cc | 4 +- .../avx512/half_float/squared_euclidean.cc | 4 +- src/turbo/avx512_fp16/half_float/cosine.cc | 12 +- .../avx512_fp16/half_float/inner_product.cc | 4 +- .../half_float/squared_euclidean.cc | 4 +- .../record_quantized_int8/inner_product.cc | 2 +- 20 files changed, 380 insertions(+), 28 deletions(-) diff --git a/src/turbo/armv8/half_float/cosine.cc b/src/turbo/armv8/half_float/cosine.cc index 91792b03f..baf39c702 100644 --- a/src/turbo/armv8/half_float/cosine.cc +++ b/src/turbo/armv8/half_float/cosine.cc @@ -39,7 +39,17 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__ARM_NEON) + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_armv8(vectors, query, n, original_dim, distances); + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } #else (void)vectors; (void)query; diff --git a/src/turbo/armv8/half_float/inner_product_common.h b/src/turbo/armv8/half_float/inner_product_common.h index 1ac007d07..54c3072ff 100644 --- a/src/turbo/armv8/half_float/inner_product_common.h +++ b/src/turbo/armv8/half_float/inner_product_common.h @@ -36,7 +36,8 @@ namespace zvec::turbo::armv8::internal { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) //! 
NEON fused multiply-add for inner product (FP16) -#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) v_sum = vfmaq_f16(v_sum, v_m, v_q); +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + v_sum = vfmaq_f16(v_sum, v_m, v_q); //! Iterative process of computing distance (FP16, M=1, N=1) #define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ @@ -82,7 +83,8 @@ namespace zvec::turbo::armv8::internal { #else //! NEON fused multiply-add for inner product (FP32) -#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) v_sum = vfmaq_f32(v_sum, v_m, v_q); +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + v_sum = vfmaq_f32(v_sum, v_m, v_q); //! Iterative process of computing distance (FP16, M=1, N=1) #define MATRIX_FP16_ITER_1X1_NEON(m, q, _RES, _PROC) \ @@ -127,6 +129,82 @@ namespace zvec::turbo::armv8::internal { #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template +static __attribute__((always_inline)) void inner_product_fp16_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + 
result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. +static __attribute__((always_inline)) void inner_product_fp16_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp16_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp16_batch_armv8_impl<1>(query, &vectors[i], prefetch_ptrs, + dim, distances + i); + } +} + } // namespace zvec::turbo::armv8::internal #endif // defined(__ARM_NEON) diff --git a/src/turbo/armv8/half_float/squared_euclidean_common.h b/src/turbo/armv8/half_float/squared_euclidean_common.h index 382c58994..df3807e61 100644 --- a/src/turbo/armv8/half_float/squared_euclidean_common.h +++ b/src/turbo/armv8/half_float/squared_euclidean_common.h @@ -40,10 +40,10 @@ namespace zvec::turbo::armv8::internal { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) //! NEON sum of squared difference (FP16) -#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ - { \ - float16x8_t v_d = vsubq_f16(v_m, v_q); \ - v_sum = vfmaq_f16(v_sum, v_d, v_d); \ +#define ACCUM_FP16_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float16x8_t v_d = vsubq_f16(v_m, v_q); \ + v_sum = vfmaq_f16(v_sum, v_d, v_d); \ } //! Iterative process of computing distance (FP16, M=1, N=1) @@ -89,10 +89,10 @@ namespace zvec::turbo::armv8::internal { #else //! 
NEON sum of squared difference (FP32) -#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ - { \ - float32x4_t v_d = vsubq_f32(v_m, v_q); \ - v_sum = vfmaq_f32(v_sum, v_d, v_d); \ +#define ACCUM_FP32_STEP_NEON(v_m, v_q, v_sum) \ + { \ + float32x4_t v_d = vsubq_f32(v_m, v_q); \ + v_sum = vfmaq_f32(v_sum, v_d, v_d); \ } //! Iterative process of computing distance (FP16, M=1, N=1) @@ -138,6 +138,82 @@ namespace zvec::turbo::armv8::internal { #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template +static __attribute__((always_inline)) void +squared_euclidean_fp16_batch_armv8_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + size_t dimensionality, float *distances) { + float32x4_t v_sum[batch_size] for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vdupq_n_f32(0); + } + + size_t dim = 0; + for (; dim + 64 <= dimensionality; dim += 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32( + v_sum[i], vld1q_f32(reinterpret_cast(query) + dim), + vld1q_f32(reinterpret_cast(vectors[i]) + dim)); + } + } + + if (dim >= dimensionality + 4) { + for (size_t i = 0; i < batch_size; ++i) { + v_sum[i] = vfmaq_f32(v_sum[i], vld1q_f32(reinterpret_cast(query)+dim), vld1q_f32(reinterpret_cast(vectors[i])+dim))); + } + + dim += 4; + } + + for (size_t i = 0; i < batch_size; ++i) { + float result = vaddvq_f32(v_sum[i]); + switch (last - lhs) { + case 3: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 2], + reinterpret_cast(vectors[i])[dim + 2], + result) + /* FALLTHRU */ + case 2: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 1], + reinterpret_cast(vectors[i])[dim + 1], + result) + /* FALLTHRU */ + case 1: + FMA_FP32_GENERAL(reinterpret_cast(query)[dim + 0], + reinterpret_cast(vectors[i])[dim + 0], + result) + } + + distances[i] = -result; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. 
+static __attribute__((always_inline)) void squared_euclidean_fp16_batch_armv8( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + squared_euclidean_fp16_batch_armv8_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + squared_euclidean_fp16_batch_armv8_impl<1>( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } +} } // namespace zvec::turbo::armv8::internal #endif // defined(__ARM_NEON) diff --git a/src/turbo/avx/float32/common.h b/src/turbo/avx/float32/common.h index cb22033cc..acd06f0de 100644 --- a/src/turbo/avx/float32/common.h +++ b/src/turbo/avx/float32/common.h @@ -17,6 +17,9 @@ #if defined(__AVX__) #include +#include +#include +#include #define SSD_FP32_GENERAL(m, q, sum) \ { \ @@ -35,4 +38,129 @@ static inline float HorizontalAdd_FP32_V256(__m256 v) { return _mm_cvtss_f32(x4); } +static inline float sum4(__m128 v) { + v = _mm_add_ps(v, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 8))); + return _mm_cvtss_f32(v) + _mm_cvtss_f32(_mm_shuffle_ps(v, v, 1)); +} + +static inline __m128 sum_top_bottom_avx(__m256 v) { + const __m128 high = _mm256_extractf128_ps(v, 1); + const __m128 low = _mm256_castps256_ps128(v); + return _mm_add_ps(high, low); +} + + +template +static std::enable_if_t, void> +inner_product_fp32_batch_avx_impl( + const ValueType *query, const ValueType *const *ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m256 accs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = 
_mm256_setzero_ps(); + } + size_t dim = 0; + for (; dim + 8 <= dimensionality; dim += 8) { + __m256 q = _mm256_loadu_ps(query + dim); + + __m256 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm256_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_fnmadd_ps(q, data_regs[i], accs[i]); + } + } + + __m128 sum128_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = sum_top_bottom_avx(accs[i]); + } + if (dim + 4 <= dimensionality) { + __m128 q = _mm_loadu_ps(query + dim); + + __m128 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = _mm_fnmadd_ps(q, data_regs[i], sum128_regs[i]); + } + dim += 4; + } + if (dim + 2 <= dimensionality) { + __m128 q = _mm_setzero_ps(); + + __m128 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_setzero_ps(); + } + + q = _mm_loadh_pi(q, (const __m64 *)(query + dim)); + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm_loadh_pi(data_regs[i], (const __m64 *)(ptrs[i] + dim)); + } + for (size_t i = 0; i < dp_batch; ++i) { + sum128_regs[i] = _mm_fnmadd_ps(q, data_regs[i], sum128_regs[i]); + } + dim += 2; + } + + float res[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + res[i] = sum4(sum128_regs[i]); + } + if (dim < dimensionality) { + float q = query[dim]; + for (size_t i = 0; i < dp_batch; ++i) { + res[i] -= q * ptrs[i][dim]; + } + } + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = -res[i]; + } +} + +// Dispatch batched inner product over all `n` vectors with prefetching. 
+static __attribute__((always_inline)) void inner_product_fp32_batch_avx( + const void *const *vectors, const void *query, size_t n, size_t dim, + float *distances) { + static constexpr size_t batch_size = 2; + static constexpr size_t prefetch_step = 2; + const float *typed_query = reinterpret_cast(query); + size_t i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (size_t j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step < n) { + prefetch_ptrs[j] = reinterpret_cast( + vectors[i + j + batch_size * prefetch_step]); + } else { + prefetch_ptrs[j] = nullptr; + } + } + inner_product_fp32_batch_avx_impl( + typed_query, reinterpret_cast(&vectors[i]), + prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + inner_product_fp32_batch_avx_impl( + typed_query, reinterpret_cast(&vectors[i]), + prefetch_ptrs, dim, distances + i); + } +} + + #endif \ No newline at end of file diff --git a/src/turbo/avx/float32/cosine.cc b/src/turbo/avx/float32/cosine.cc index 488fadc20..d2f94f4bf 100644 --- a/src/turbo/avx/float32/cosine.cc +++ b/src/turbo/avx/float32/cosine.cc @@ -43,13 +43,13 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) - const int original_dim = dim - 1; + constexpr size_t extra_dim = 1; + const int original_dim = dim - extra_dim; if (original_dim <= 0) { return; } - internal::inner_product_fp32_batch_avx(vectors, query, n, original_dim, - distances); + inner_product_fp32_batch_distance(vectors, query, n, original_dim, distances); for (int i = 0; i < n; ++i) { distances[i] = 1 - distances[i]; diff --git a/src/turbo/avx/float32/squared_euclidean.cc b/src/turbo/avx/float32/squared_euclidean.cc index 19e81abb0..9240ea7e9 100644 --- a/src/turbo/avx/float32/squared_euclidean.cc +++ 
b/src/turbo/avx/float32/squared_euclidean.cc @@ -106,7 +106,9 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) - squared_euclidean_fp32_batch_avx(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)distances; diff --git a/src/turbo/avx/half_float/cosine.cc b/src/turbo/avx/half_float/cosine.cc index af68a7d8a..27a3c7dbd 100644 --- a/src/turbo/avx/half_float/cosine.cc +++ b/src/turbo/avx/half_float/cosine.cc @@ -43,7 +43,18 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) - cosine_fp16_batch_avx(vectors, query, n, dim, distances); + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } + #else (void)vectors; (void)query; diff --git a/src/turbo/avx/half_float/inner_product.cc b/src/turbo/avx/half_float/inner_product.cc index 44a72dbaa..4ac05de2a 100644 --- a/src/turbo/avx/half_float/inner_product.cc +++ b/src/turbo/avx/half_float/inner_product.cc @@ -43,7 +43,9 @@ void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) - inner_product_fp16_batch_avx(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx/half_float/squared_euclidean.cc b/src/turbo/avx/half_float/squared_euclidean.cc index 222ec1176..24913891c 100644 --- 
a/src/turbo/avx/half_float/squared_euclidean.cc +++ b/src/turbo/avx/half_float/squared_euclidean.cc @@ -40,7 +40,9 @@ void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX__) - squared_euclidean_fp16_batch_avx(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx2/record_quantized_int4/inner_product.cc b/src/turbo/avx2/record_quantized_int4/inner_product.cc index 4db9e7e61..e70cf2ed1 100644 --- a/src/turbo/avx2/record_quantized_int4/inner_product.cc +++ b/src/turbo/avx2/record_quantized_int4/inner_product.cc @@ -63,7 +63,7 @@ void inner_product_int4_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX2__) - inner_product_int4_batch_avx2(vectors, query, n, dim, distances); + internal::inner_product_int4_batch_avx2(vectors, query, n, dim, distances); #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/float32/cosine.cc b/src/turbo/avx512/float32/cosine.cc index 55c48c7bf..3fff482c4 100644 --- a/src/turbo/avx512/float32/cosine.cc +++ b/src/turbo/avx512/float32/cosine.cc @@ -43,7 +43,18 @@ void cosine_fp32_distance(const void *a, const void *b, size_t dim, void cosine_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) - cosine_fp32_batch_avx512(vectors, query, n, dim, distances); + // `dim` is the full encoded size; the original vector occupies dim-1 elements.
+ const int original_dim = dim - 1; + if (original_dim <= 0) { + return; + } + + inner_product_fp32_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } + #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/float32/inner_product.cc b/src/turbo/avx512/float32/inner_product.cc index 0055d5911..b28ef2e6a 100644 --- a/src/turbo/avx512/float32/inner_product.cc +++ b/src/turbo/avx512/float32/inner_product.cc @@ -89,14 +89,16 @@ void inner_product_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) - + for (size_t i = 0; i < n; ++i) { + inner_product_fp32_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; (void)n; (void)dim; (void)distances; -#endif //__AVX2__ +#endif //__AVX512F__ } } // namespace zvec::turbo::avx512 \ No newline at end of file diff --git a/src/turbo/avx512/float32/squared_euclidean.cc b/src/turbo/avx512/float32/squared_euclidean.cc index 03e0120d6..cc00cacf9 100644 --- a/src/turbo/avx512/float32/squared_euclidean.cc +++ b/src/turbo/avx512/float32/squared_euclidean.cc @@ -90,7 +90,9 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) - squared_euclidean_fp32_batch_avx512(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp32_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/half_float/cosine.cc b/src/turbo/avx512/half_float/cosine.cc index d123197f9..bf08eb744 100644 --- a/src/turbo/avx512/half_float/cosine.cc +++ b/src/turbo/avx512/half_float/cosine.cc @@ -43,7 +43,17 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t 
dim, float *distances) { #if defined(__AVX512F__) + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/half_float/inner_product.cc b/src/turbo/avx512/half_float/inner_product.cc index 058b522a9..221d0a2ab 100644 --- a/src/turbo/avx512/half_float/inner_product.cc +++ b/src/turbo/avx512/half_float/inner_product.cc @@ -44,7 +44,9 @@ void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) - inner_product_fp16_batch_avx512(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512/half_float/squared_euclidean.cc b/src/turbo/avx512/half_float/squared_euclidean.cc index 0569b4d6c..7a4b18e11 100644 --- a/src/turbo/avx512/half_float/squared_euclidean.cc +++ b/src/turbo/avx512/half_float/squared_euclidean.cc @@ -46,7 +46,9 @@ void squared_euclidean_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512F__) - squared_euclidean_fp16_batch_avx512(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512_fp16/half_float/cosine.cc b/src/turbo/avx512_fp16/half_float/cosine.cc index ab9f88171..a5404712a 100644 --- a/src/turbo/avx512_fp16/half_float/cosine.cc +++ b/src/turbo/avx512_fp16/half_float/cosine.cc @@ -43,7 +43,17 @@ void cosine_fp16_distance(const void *a, const void *b, size_t dim, void cosine_fp16_batch_distance(const
void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512FP16__) - cosine_fp16_batch_avx512(vectors, query, n, dim, distances); + constexpr size_t extra_dim = 2; + const int original_dim = dim - extra_dim; + if (original_dim <= 0) { + return; + } + + inner_product_fp16_batch_distance(vectors, query, n, original_dim, distances); + + for (size_t i = 0; i < n; ++i) { + distances[i] = 1 - distances[i]; + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512_fp16/half_float/inner_product.cc b/src/turbo/avx512_fp16/half_float/inner_product.cc index cba33b9a4..c7262577d 100644 --- a/src/turbo/avx512_fp16/half_float/inner_product.cc +++ b/src/turbo/avx512_fp16/half_float/inner_product.cc @@ -97,7 +97,9 @@ void inner_product_fp16_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512FP16__) - inner_product_fp16_batch_avx512fp16(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + inner_product_fp16_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc index 7e6962892..5e33255b3 100644 --- a/src/turbo/avx512_fp16/half_float/squared_euclidean.cc +++ b/src/turbo/avx512_fp16/half_float/squared_euclidean.cc @@ -99,7 +99,9 @@ void squared_euclidean_fp32_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512FP16__) - squared_euclidean_fp32_batch_avx512fp16(vectors, query, n, dim, distances); + for (size_t i = 0; i < n; ++i) { + squared_euclidean_fp16_distance(vectors[i], query, dim, &distances[i]); + } #else (void)vectors; (void)query; diff --git a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc index e176ce7f2..db83b128a 100644 ---
a/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc +++ b/src/turbo/avx512_vnni/record_quantized_int8/inner_product.cc @@ -52,7 +52,7 @@ void inner_product_int8_batch_distance(const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { #if defined(__AVX512VNNI__) - inner_product_int8_batch_avx512_vnni(vectors, query, n, dim, distances); + internal::ip_int8_batch_avx512_vnni(vectors, query, n, dim, distances); #else (void)vectors; (void)query; From 1d02de35b5f480992ef809dd1ecf5155621bada1 Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 16 Apr 2026 21:01:09 +0800 Subject: [PATCH 43/44] feat: add quantizer --- src/core/metric/quantized_integer_metric.cc | 34 +-- src/include/zvec/core/framework/index_meta.h | 13 +- .../zvec/core/framework/index_metric.h | 3 + src/include/zvec/turbo/turbo.h | 7 + .../core/algorithm/hnsw/hnsw_streamer_test.cc | 278 ++++++------------ 5 files changed, 127 insertions(+), 208 deletions(-) diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index b0fc95995..bbb2e587d 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -96,18 +96,18 @@ class QuantizedIntegerMetric : public IndexMetric { switch (origin_metric_type_) { case MetricType::kSquaredEuclidean: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kSquaredEuclidean, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret && m == 1 && n == 1) { return turbo_ret; } return DistanceMatrixCompute(m, n); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt4, - turbo::QuantizeType::kDefault); + auto turbo_ret = + 
turbo::get_distance_func(turbo::MetricType::kSquaredEuclidean, + turbo::DataType::kInt4, quantize_type_); if (turbo_ret && m == 1 && n == 1) { return turbo_ret; } @@ -118,9 +118,9 @@ class QuantizedIntegerMetric : public IndexMetric { case MetricType::kInnerProduct: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kInnerProduct, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kInnerProduct, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret && m == 1 && n == 1) { return turbo_ret; } @@ -128,9 +128,9 @@ class QuantizedIntegerMetric : public IndexMetric { } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kInnerProduct, turbo::DataType::kInt4, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kInnerProduct, + turbo::DataType::kInt4, quantize_type_); if (turbo_ret && m == 1 && n == 1) { return turbo_ret; } @@ -157,9 +157,9 @@ class QuantizedIntegerMetric : public IndexMetric { break; case MetricType::kCosine: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { - auto turbo_ret = turbo::get_distance_func( - turbo::MetricType::kCosine, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + auto turbo_ret = + turbo::get_distance_func(turbo::MetricType::kCosine, + turbo::DataType::kInt8, quantize_type_); if (turbo_ret) { return turbo_ret; } @@ -180,7 +180,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { auto turbo_ret = turbo::get_batch_distance_func( turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + quantize_type_); if (turbo_ret) { return turbo_ret; } @@ -235,7 +235,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) 
{ auto turbo_ret = turbo::get_batch_distance_func( turbo::MetricType::kCosine, turbo::DataType::kInt8, - turbo::QuantizeType::kDefault); + quantize_type_); if (turbo_ret) { return turbo_ret; } diff --git a/src/include/zvec/core/framework/index_meta.h b/src/include/zvec/core/framework/index_meta.h index 451e14059..a11af00f4 100644 --- a/src/include/zvec/core/framework/index_meta.h +++ b/src/include/zvec/core/framework/index_meta.h @@ -38,18 +38,9 @@ class IndexMeta { DT_INT4 = 6, DT_BINARY32 = 7, DT_BINARY64 = 8, - - // new data type for turboss - // DT_ZVEC_FP16_ = 11, - // DT_ZVEC_FP32 = 12, - // DT_ZVEC_FP64 = 13, - // DT_ZVEC_INT8 = 14, - // DT_ZVEC_INT16 = 15, - // DT_ZVEC_INT4 = 16, - // DT_ZVEC_BINARY32 = 7, - // DT_ZVEC_BINARY64 = 8, }; + /*! Major Orders */ enum MajorOrder { @@ -719,6 +710,8 @@ class IndexQueryMeta { uint32_t dimension_{0}; uint32_t unit_size_{0}; uint32_t element_size_{0}; + uint32_t extra_meta_size_{0}; + uint32_t quantize_type_{0}; }; } // namespace core diff --git a/src/include/zvec/core/framework/index_metric.h b/src/include/zvec/core/framework/index_metric.h index 24d772362..eeb54099f 100644 --- a/src/include/zvec/core/framework/index_metric.h +++ b/src/include/zvec/core/framework/index_metric.h @@ -137,6 +137,9 @@ struct IndexMetric : public IndexModule { virtual DistanceBatchQueryPreprocessFunc get_query_preprocess_func() const { return nullptr; } + + private: + int quantize_type_{0}; }; } // namespace core diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h index 70ddabd6d..f07ace8c6 100644 --- a/src/include/zvec/turbo/turbo.h +++ b/src/include/zvec/turbo/turbo.h @@ -43,6 +43,13 @@ enum class DataType { enum class QuantizeType { kDefault, + kRecordInt8, + kRecordInt4, + kInt8, + kInt4, + kFp16, + kPQ, + kRabit }; enum class CpuArchType { diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_test.cc index 3f27f5252..1ee7ef6d1 100644 --- 
a/tests/core/algorithm/hnsw/hnsw_streamer_test.cc +++ b/tests/core/algorithm/hnsw/hnsw_streamer_test.cc @@ -3471,93 +3471,6 @@ TEST_F(HnswStreamerTest, TestGroupInBruteforceSearch) { } } -#if 0 -TEST_F(HnswStreamerTest, TestBinaryConverter) { - uint32_t dimension = 2560; - - IndexStreamer::Pointer streamer = - IndexFactory::CreateStreamer("HnswStreamer"); - ASSERT_TRUE(streamer != nullptr); - - ailego::Params params; - // params.set(PARAM_HNSW_STREAMER_MAX_NEIGHBOR_COUNT, 10); - // params.set(PARAM_HNSW_STREAMER_SCALING_FACTOR, 16); - // params.set(PARAM_HNSW_STREAMER_EFCONSTRUCTION, 10); - // params.set(PARAM_HNSW_STREAMER_EF, 5); - params.set(PARAM_HNSW_STREAMER_BRUTE_FORCE_THRESHOLD, 1000U); - - ailego::Params stg_params; - - IndexMeta index_meta_raw(IndexMeta::DataType::DT_FP32, dimension); - index_meta_raw.set_metric("InnerProduct", 0, ailego::Params()); - - ailego::Params converter_params; - auto converter = IndexFactory::CreateConverter("BinaryConverter"); - ASSERT_TRUE(converter != nullptr); - - converter->init(index_meta_raw, converter_params); - - IndexMeta index_meta = converter->meta(); - - auto reformer = IndexFactory::CreateReformer(index_meta.reformer_name()); - ASSERT_TRUE(reformer != nullptr); - - ASSERT_EQ(0, reformer->init(index_meta.reformer_params())); - - auto storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, storage->init(stg_params)); - ASSERT_EQ(0, storage->open(dir_ + "TestBinaryConverter.index", true)); - ASSERT_EQ(0, streamer->init(index_meta, params)); - ASSERT_EQ(0, streamer->open(storage)); - - size_t cnt = 5000U; - auto ctx = streamer->create_context(); - ASSERT_TRUE(!!ctx); - - IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dimension); - - std::random_device rd; - std::mt19937 gen(rd()); - - std::uniform_real_distribution dist(-2.0, 2.0); - std::vector> vecs; - - for (size_t i = 0; i < cnt; i++) { - NumericalVector vec(dimension); - for (size_t j = 0; j < dimension; ++j) { - vec[j] = dist(gen); - } - - 
std::string new_vec; - IndexQueryMeta new_meta; - - ASSERT_EQ(0, reformer->convert(vec.data(), qmeta, &new_vec, &new_meta)); - ASSERT_EQ(0, streamer->add_impl(i, new_vec.data(), new_meta, ctx)); - - vecs.push_back(vec); - } - - size_t query_cnt = 200U; - auto knnCtx = streamer->create_context(); - - float epison = 1e-6; - for (size_t i = 0; i < query_cnt; i++) { - auto &vec = vecs[i]; - std::string new_query; - IndexQueryMeta new_meta; - ASSERT_EQ(0, reformer->transform(vec.data(), qmeta, &new_query, &new_meta)); - - size_t topk = 50; - knnCtx->set_topk(topk); - ASSERT_EQ(0, streamer->search_impl(new_query.data(), new_meta, knnCtx)); - auto &results = knnCtx->result(); - ASSERT_EQ(topk, results.size()); - ASSERT_EQ(i, results[0].key()); - ASSERT_NEAR(0, results[0].score(), epison); - } -} -#endif - TEST_F(HnswStreamerTest, TestAddAndSearchWithID) { IndexStreamer::Pointer streamer = IndexFactory::CreateStreamer("HnswStreamer"); @@ -3671,131 +3584,134 @@ TEST_F(HnswStreamerTest, TestAddAndSearchWithID) { // EXPECT_GT(cost, 2.0f); } -#if 0 -TEST_F(HnswStreamerTest, TestBasicRefiner) { - uint32_t dimension = 1120; - - IndexStreamer::Pointer base_streamer = +TEST_F(HnswStreamerTest, TestTurboCosineInt8Quantizer) { + IndexStreamer::Pointer streamer = IndexFactory::CreateStreamer("HnswStreamer"); - ASSERT_TRUE(base_streamer != nullptr); + ASSERT_TRUE(streamer != nullptr); - IndexStreamer::Pointer refine_streamer = - IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(refine_streamer != nullptr); + ailego::Params params; + params.set(PARAM_HNSW_STREAMER_MAX_NEIGHBOR_COUNT, 50); + params.set(PARAM_HNSW_STREAMER_SCALING_FACTOR, 16); + params.set(PARAM_HNSW_STREAMER_EFCONSTRUCTION, 100); + params.set(PARAM_HNSW_STREAMER_EF, 100); + params.set(PARAM_HNSW_STREAMER_BRUTE_FORCE_THRESHOLD, 1000U); + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); - IndexRefiner::Pointer refiner = IndexFactory::CreateRefiner("BasicRefiner"); - ASSERT_TRUE(refiner != nullptr); + 
ailego::Params stg_params; - ailego::Params params; - IndexMeta index_meta(IndexMeta::DataType::DT_FP32, dimension); - index_meta.set_metric("InnerProduct", 0, ailego::Params()); + IndexMeta index_meta_raw(IndexMeta::DataType::DT_FP32, dim); + index_meta_raw.set_metric("Cosine", 0, ailego::Params()); ailego::Params converter_params; - auto converter = IndexFactory::CreateConverter("BinaryConverter"); - ASSERT_TRUE(converter != nullptr); + auto quantizer = IndexFactory::CreateQuantier("Int8Quantizer"); /* TODO(review): confirm factory name — likely a typo for CreateQuantizer */ + ASSERT_TRUE(quantizer != nullptr); - converter->init(index_meta, converter_params); + quantizer->init(index_meta_raw, converter_params); - IndexMeta index_meta_binary = converter->meta(); + IndexMeta index_meta = quantizer->meta(); - auto reformer = - IndexFactory::CreateReformer(index_meta_binary.reformer_name()); - ASSERT_TRUE(reformer != nullptr); + auto storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, + storage->open(dir_ + "TestTurboCosineInt8Quantizer.index", true)); + ASSERT_EQ(0, streamer->init(index_meta, params)); + ASSERT_EQ(0, streamer->open(storage)); - ASSERT_EQ(0, reformer->init(index_meta_binary.reformer_params())); + NumericalVector vec(dim); + size_t cnt = 2000U; + auto ctx = streamer->create_context(); + ASSERT_TRUE(!!ctx); - // base streamer - ailego::Params base_stg_params; - auto base_storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, base_storage->init(base_stg_params)); - ASSERT_EQ(0, base_storage->open(dir_ + "TestBasicRefinerBase.index", true)); - ASSERT_EQ(0, base_streamer->init(index_meta_binary, params)); - ASSERT_EQ(0, base_streamer->open(base_storage)); + IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dim); + IndexQueryMeta new_meta; - auto base_ctx = base_streamer->create_context(); - ASSERT_TRUE(!!base_ctx); + const float epsilon = 1e-2; + float fixed_value = float(cnt) / 2; + for (size_t i = 0; i < cnt; i++) { + float add_on = i * 10; + for
(size_t j = 0; j < dim; ++j) { + if (j < dim / 4) + vec[j] = fixed_value; + else + vec[j] = fixed_value + add_on; + } - // refine streamer - ailego::Params refine_stg_params; - auto refine_storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_EQ(0, refine_storage->init(refine_stg_params)); - ASSERT_EQ(0, - refine_storage->open(dir_ + "TestBasicRefinerRefine.index", true)); - ASSERT_EQ(0, refine_streamer->init(index_meta, params)); - ASSERT_EQ(0, refine_streamer->open(refine_storage)); - auto refine_ctx = refine_streamer->create_context(); - ASSERT_TRUE(!!refine_ctx); + std::string new_vec; - ailego::Params refiner_params; - ASSERT_EQ(0, refiner->init(base_streamer, refine_streamer, refiner_params)); + ASSERT_EQ(0, quantizer->convert(vec.data(), qmeta, &new_vec, &new_meta)); + ASSERT_EQ(0, streamer->add_impl(i, new_vec.data(), new_meta, ctx)); + } - auto ctx = refiner->create_context(); - ASSERT_TRUE(!!ctx); + for (size_t i = 0; i < cnt; i++) { + float add_on = i * 10; - IndexQueryMeta qmeta(IndexMeta::DataType::DT_FP32, dimension); + const void *vector = streamer->get_vector(i); + ASSERT_NE(vector, nullptr); - std::random_device rd; - std::mt19937 gen(rd()); + std::string denormalized_vec; + denormalized_vec.resize(dim * sizeof(float)); + quantizer->revert(vector, new_meta, &denormalized_vec); - std::uniform_real_distribution dist(-2.0, 2.0); - std::vector> vecs; + float vector_value = *((float *)(denormalized_vec.data()) + dim - 1); + EXPECT_NEAR(vector_value, fixed_value + add_on, epsilon); + } - size_t cnt = 5000U; - for (size_t i = 0; i < cnt; i++) { - NumericalVector vec(dimension); - for (size_t j = 0; j < dimension; ++j) { - vec[j] = dist(gen); + auto linearCtx = streamer->create_context(); + linearCtx->set_fetch_vector(true); + auto knnCtx = streamer->create_context(); + knnCtx->set_fetch_vector(true); + + size_t query_cnt = 200U; + size_t topk = 200; + linearCtx->set_topk(topk); + knnCtx->set_topk(topk); + uint64_t knnTotalTime = 0; + uint64_t 
linearTotalTime = 0; + for (size_t i = 0; i < query_cnt; i++) { + float add_on = i * 10; + for (size_t j = 0; j < dim; ++j) { + if (j < dim / 4) + vec[j] = fixed_value; + else + vec[j] = fixed_value + add_on; } - std::string binary_vec; - IndexQueryMeta binary_qmeta; + std::string new_query; + IndexQueryMeta new_meta; + ASSERT_EQ(0, quantizer->quantize(vec.data(), qmeta, &new_query, &new_meta)); + auto t1 = ailego::Realtime::MicroSeconds(); + ASSERT_EQ(0, streamer->search_impl(new_query.data(), new_meta, knnCtx)); + auto t2 = ailego::Realtime::MicroSeconds(); ASSERT_EQ(0, - reformer->convert(vec.data(), qmeta, &binary_vec, &binary_qmeta)); - ASSERT_EQ(0, refiner->add_impl(i, binary_vec.data(), binary_qmeta, - vec.data(), qmeta, ctx)); - - vecs.push_back(vec); - } + streamer->search_bf_impl(new_query.data(), new_meta, linearCtx)); + auto t3 = ailego::Realtime::MicroSeconds(); - size_t query_cnt = 200U; - // size_t query_cnt = 1U; + knnTotalTime += t2 - t1; + linearTotalTime += t3 - t2; - auto searcherCtx = refiner->create_context(); + auto &knnResult = knnCtx->result(); + ASSERT_EQ(topk, knnResult.size()); - for (size_t i = 0; i < query_cnt; i++) { - auto &vec = vecs[i]; + auto &linearResult = linearCtx->result(); + ASSERT_EQ(topk, linearResult.size()); + ASSERT_EQ(i, linearResult[0].key()); - // float abs_value{0}; - // for (size_t j = 0; j < dimension; ++j) { - // std::cout << "dim: " << j << ", value: " << vec[j] << std::endl; + ASSERT_NE(knnResult[0].vector(), nullptr); + ASSERT_NE(linearResult[0].vector(), nullptr); - // abs_value += std::abs(vec[j]); - // } - // std::cout << "abs value: " << abs_value << std::endl; + std::string denormalized_vec; + denormalized_vec.resize(dim * sizeof(float)); + quantizer->dequantize(linearResult[0].vector(), new_meta, + &denormalized_vec); - std::string new_query; - IndexQueryMeta binary_qmeta; - ASSERT_EQ( - 0, reformer->transform(vec.data(), qmeta, &new_query, &binary_qmeta)); - - size_t topk = 50; - 
searcherCtx->set_topk(topk); - ASSERT_EQ(0, refiner->search_impl(new_query.data(), binary_qmeta, - vec.data(), qmeta, searcherCtx)); - auto &results = searcherCtx->result(); - ASSERT_EQ(topk, results.size()); - ASSERT_EQ(i, results[0].key()); - - // for (size_t i = 0; i < results.size(); ++i) { - // std::cout << i << ", id: " << results[i].index() - // << ", score: " << results[i].score() << std::endl; - // } + float vector_value = *(((float *)(denormalized_vec.data()) + dim - 1)); + EXPECT_NEAR(vector_value, fixed_value + add_on, epsilon); } -} - -#endif + std::cout << "knnTotalTime: " << knnTotalTime << std::endl; + std::cout << "linearTotalTime: " << linearTotalTime << std::endl; +} } // namespace core } // namespace zvec From 868678072563e5573b11f0d92b5d40587d38053e Mon Sep 17 00:00:00 2001 From: ray Date: Thu, 16 Apr 2026 21:01:38 +0800 Subject: [PATCH 44/44] feat: add quantizer --- .../record_int4_quantizer.cc | 0 .../record_int8_quantizer.cc | 21 ++++++++ .../reocrd_int8_quantier.h | 48 +++++++++++++++++++ src/turbo/quantizer/quantizer.h | 33 +++++++++++++ 4 files changed, 102 insertions(+) create mode 100644 src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc create mode 100644 src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc create mode 100644 src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h create mode 100644 src/turbo/quantizer/quantizer.h diff --git a/src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc b/src/turbo/quantizer/RecordInt4Quantizer/record_int4_quantizer.cc new file mode 100644 index 000000000..e69de29bb diff --git a/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc b/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc new file mode 100644 index 000000000..72617e56b --- /dev/null +++ b/src/turbo/quantizer/RecordInt8Quantizer/record_int8_quantizer.cc @@ -0,0 +1,21 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 
2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +// NOTE(review): dropped stray '#pragma once' — it belongs in headers, not .cc files + +namespace zvec { +namespace turbo {} // namespace turbo +} // namespace zvec \ No newline at end of file diff --git a/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h b/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h new file mode 100644 index 000000000..8e083ae25 --- /dev/null +++ b/src/turbo/quantizer/RecordInt8Quantizer/reocrd_int8_quantier.h @@ -0,0 +1,48 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once + +#include +#include + +namespace zvec { +namespace turbo { + +class RecordInt8Quantizer : public Quantizer { + public: + RecordInt8Quantizer() { type_ = QuantizeType::kRecordInt8; } /* mem-init list cannot name a base member; assign instead */ + + virtual ~RecordInt8Quantizer() {} + + public: + QuantizeType type() const override { + return type_; + } + + const IndexMeta &meta(void) const { /* 'override' dropped: Quantizer declares no virtual meta() */ + return meta_; + } + + private: + IndexMeta meta_{}; + IndexHolder::Pointer holder_{}; + std::shared_ptr quantizer_{}; + Stats stats_{}; + IndexMeta::DataType data_type_{}; +}; + + +} // namespace turbo +} // namespace zvec diff --git a/src/turbo/quantizer/quantizer.h b/src/turbo/quantizer/quantizer.h new file mode 100644 index 000000000..b051a6d87 --- /dev/null +++ b/src/turbo/quantizer/quantizer.h @@ -0,0 +1,33 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace zvec { +namespace turbo { + +class Quantizer { + public: + Quantizer() {} + virtual ~Quantizer() {} + virtual QuantizeType type() const { return type_; } + protected: + QuantizeType type_{QuantizeType::kDefault}; +}; + +} // namespace turbo +} // namespace zvec