diff --git a/banquet.cpp b/banquet.cpp index bfa7821..f9b2b4d 100644 --- a/banquet.cpp +++ b/banquet.cpp @@ -481,13 +481,15 @@ banquet_signature_t banquet_sign(const banquet_instance_t &instance, // a vector of the first m2+1 field elements for interpolation std::vector x_values_for_interpolation_zero_to_m2 = field::get_first_n_field_elements(instance.m2 + 1); - std::vector> precomputation_for_zero_to_m2 = - field::precompute_lagrange_polynomials( - x_values_for_interpolation_zero_to_m2); + // a vector of the first 2 * m2+1 field elements for interpolation std::vector x_values_for_interpolation_zero_to_2m2 = field::get_first_n_field_elements(2 * instance.m2 + 1); + + std::vector> precomputation_for_zero_to_m2 = + field::precompute_lagrange_polynomials_slow( + x_values_for_interpolation_zero_to_m2); std::vector> precomputation_for_zero_to_2m2 = - field::precompute_lagrange_polynomials( + field::precompute_lagrange_polynomials_slow( x_values_for_interpolation_zero_to_2m2); std::vector>>> s_prime( diff --git a/field.cpp b/field.cpp index ed860d9..0aee604 100644 --- a/field.cpp +++ b/field.cpp @@ -1,7 +1,13 @@ #include "field.h" +#include #include +#include +#include +#include #include +#include +#include #include #include #include @@ -32,6 +38,17 @@ inline __m128i clmul(uint64_t a, uint64_t b) { return _mm_clmulepi64_si128(_mm_set_epi64x(0, a), _mm_set_epi64x(0, b), 0); } +// actually a bit slowerthan naive version below on some CPU version perhaps ?? +uint64_t reduce_GF2_16_barret(__m128i in) { + // modulus = x^16 + x^5 + x^3 + x + 1 + constexpr uint64_t lower_mask = 0xFFFFULL; + constexpr uint64_t P = + (1ULL << 16) | (1ULL << 5) | (1ULL << 3) | (1ULL << 1) | (1ULL << 0); + uint64_t R = _mm_cvtsi128_si64(in); + uint64_t T1 = _mm_cvtsi128_si64(clmul(R >> 16, P)); + uint64_t T2 = _mm_cvtsi128_si64(clmul(T1 >> 16, P)); + return lower_mask & (R ^ T2); +} uint64_t reduce_GF2_16(__m128i in) { // modulus = x^16 + x^5 + x^3 + x + 1 constexpr uint64_t lower_mask = 0xFFFFULL; @@ -44,17 +61,39 @@ uint64_t reduce_GF2_16(__m128i in) { (R_upper << 0); return lower_mask & R_lower; } +// actually this is slower than the above two :( +uint64_t reduce_GF2_16_clmul(const __m128i in) { + // modulus = x^16 + x^5 + x^3 + x + 1 + __m128i p = _mm_set_epi64x(0x0, 0x2B); + __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFF0000); + __m128i t; + + __m128i hi = _mm_srli_si128(in, 2); // extracting the in_hi + __m128i low = + _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // in_hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 4 + 16 -> Length after xor + + hi = _mm_srli_si128(t, 2); // extracting the t_hi + low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // t_hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 16 -> Length after xor + + return _mm_extract_epi64(t, 0); +} -// actually a bit slowerthan naive version below -__attribute__((unused)) uint64_t reduce_GF2_32_barret(__m128i in) { +// actually a bit slowerthan naive version below on some CPU version perhaps ?? +uint64_t reduce_GF2_32_barret(__m128i in) { // modulus = x^32 + x^7 + x^3 + x^2 + 1 + constexpr uint64_t lower_mask = 0xFFFFFFFFULL; constexpr uint64_t P = (1ULL << 32) | (1ULL << 7) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0); - constexpr uint64_t mu = P; uint64_t R = _mm_cvtsi128_si64(in); - uint64_t T1 = _mm_cvtsi128_si64(clmul(R >> 32, mu)); + uint64_t T1 = _mm_cvtsi128_si64(clmul(R >> 32, P)); uint64_t T2 = _mm_cvtsi128_si64(clmul(T1 >> 32, P)); - return 0xFFFFFFFFULL & (R ^ T2); + return lower_mask & (R ^ T2); } uint64_t reduce_GF2_32(__m128i in) { // modulus = x^32 + x^7 + x^3 + x^2 + 1 @@ -69,6 +108,45 @@ uint64_t reduce_GF2_32(__m128i in) { return lower_mask & R_lower; } +// actually this is slower than the above two :( +uint64_t reduce_GF2_32_clmul(const __m128i in) { + // modulus = x^32 + x^7 + x^3 + x^2 + 1 + __m128i p = _mm_set_epi64x(0x0, 0x8d); + __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000); + __m128i t; + + __m128i hi = _mm_srli_si128(in, 4); // extracting the in_hi + __m128i low = + _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // in_hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 4 + 32 -> Length after xor + + hi = _mm_srli_si128(t, 4); // extracting the t_hi + low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // t_hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 16 -> Length after xor + + return _mm_extract_epi64(t, 0); +} + +// actually a bit slowerthan naive version below on some CPU version perhaps ?? +uint64_t reduce_GF2_40_barret(__m128i in) { + // modulus = x^40 + x^5 + x^4 + x^3 + 1 + constexpr uint64_t upper_mask = 0xFFFFULL; + constexpr uint64_t lower_mask = 0xFFFFFFFFFFULL; + constexpr uint64_t P = + (1ULL << 40) | (1ULL << 5) | (1ULL << 4) | (1ULL << 3) | (1ULL << 0); + uint64_t R_lower = _mm_extract_epi64(in, 0); + uint64_t R_upper = + ((_mm_extract_epi64(in, 1) & upper_mask) << 24) | (R_lower >> 40); + uint64_t T1 = _mm_cvtsi128_si64(clmul(R_upper, P)); + T1 = ((_mm_extract_epi64(in, 1) & upper_mask) << 24) | (T1 >> 40); + uint64_t T2 = _mm_cvtsi128_si64(clmul(T1, P)); + return lower_mask & (R_lower ^ T2); +} + uint64_t reduce_GF2_40(__m128i in) { // modulus = x^40 + x^5 + x^4 + x^3 + 1 constexpr uint64_t upper_mask = 0xFFFFULL; @@ -84,6 +162,44 @@ uint64_t reduce_GF2_40(__m128i in) { return lower_mask & R_lower; } +uint64_t reduce_GF2_40_clmul(const __m128i in) { + // modulus = x^40 + x^5 + x^4 + x^3 + 1 + __m128i p = _mm_set_epi64x(0x0, 0x39); + __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFF0000000000); + __m128i t; + + __m128i hi = _mm_srli_si128(in, 5); // extracting the hi + __m128i low = + _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 4 + 40 -> Length after xor + + hi = _mm_srli_si128(t, 5); // extracting the hi + low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 40 -> Length after xor + + return _mm_extract_epi64(t, 0); +} + +// actually a bit slowerthan naive version below on some CPU version perhaps ?? +uint64_t reduce_GF2_48_barret(__m128i in) { + // modulus = x^48 + x^5 + x^3 + x^2 + 1 + constexpr uint64_t upper_mask = 0xFFFFFFFFULL; + constexpr uint64_t lower_mask = 0xFFFFFFFFFFFFULL; + constexpr uint64_t P = + (1ULL << 48) | (1ULL << 5) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0); + uint64_t R_lower = _mm_extract_epi64(in, 0); + uint64_t R_upper = + ((_mm_extract_epi64(in, 1) & upper_mask) << 16) | (R_lower >> 48); + uint64_t T1 = _mm_cvtsi128_si64(clmul(R_upper, P)); + T1 = ((_mm_extract_epi64(in, 1) & upper_mask) << 16) | (T1 >> 48); + uint64_t T2 = _mm_cvtsi128_si64(clmul(T1, P)); + return lower_mask & (R_lower ^ T2); +} + uint64_t reduce_GF2_48(__m128i in) { // modulus = x^48 + x^5 + x^3 + x^2 + 1 constexpr uint64_t upper_mask = 0xFFFFFFFFULL; @@ -98,6 +214,29 @@ uint64_t reduce_GF2_48(__m128i in) { return lower_mask & R_lower; } +// actually this is slower than the above two :( +uint64_t reduce_GF2_48_clmul(const __m128i in) { + // modulus = x^48 + x^5 + x^3 + x^2 + 1 + __m128i p = _mm_set_epi64x(0x0, 0x2d); + __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF000000000000); + __m128i t; + + __m128i hi = _mm_srli_si128(in, 6); // extracting the hi + __m128i low = + _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 4 + 48 -> Length after xor + + hi = _mm_srli_si128(t, 6); // extracting the hi + low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low + + t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p + t = _mm_xor_si128(t, low); // 48 -> Length after xor + + return _mm_extract_epi64(t, 0); +} + uint64_t GF2_euclidean_div_quotient(uint64_t a, uint64_t b) { uint64_t quotient = 0; int diff = __builtin_clzl(b) - __builtin_clzl(a); @@ -134,7 +273,9 @@ uint64_t mod_inverse(uint64_t a, uint64_t mod) { namespace field { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" -std::function GF2E::reduce = nullptr; +uint64_t (*GF2E::reduce_naive)(__m128i); +uint64_t (*GF2E::reduce_barret)(__m128i); +uint64_t (*GF2E::reduce_clmul)(__m128i); #pragma GCC diagnostic pop size_t GF2E::byte_size = 0; uint64_t GF2E::modulus = 0; @@ -154,10 +295,10 @@ GF2E &GF2E::operator-=(const GF2E &other) { return *this; } GF2E GF2E::operator*(const GF2E &other) const { - return GF2E(reduce(clmul(this->data, other.data))); + return GF2E(reduce_clmul(clmul(this->data, other.data))); } GF2E &GF2E::operator*=(const GF2E &other) { - this->data = reduce(clmul(this->data, other.data)); + this->data = reduce_clmul(clmul(this->data, other.data)); return *this; } bool GF2E::operator==(const GF2E &other) const { @@ -167,18 +308,138 @@ bool GF2E::operator!=(const GF2E &other) const { return this->data != other.data; } +std::ostream &operator<<(std::ostream &os, const GF2E &ele) { + os << "0x" << std::setfill('0') << std::hex << std::setw(8) << ele.get_data(); + return os; +} + GF2E GF2E::inverse() const { return GF2E(mod_inverse(this->data, modulus)); } +// This is actually slow than the other inverse inplementation but this has +// constant time, so gets better with larger root size and field size +GF2E GF2E::inverse_const_time() const { + + switch (this->byte_size) { + case 2: { + constexpr uint64_t u[6] = {1, 2, 3, 6, 12, 15}; + constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]); + // q = u[i] - u[i - 1] should give us the corresponding values + // (1, 1, 3, 6, 3), which will have corresponding indexes + constexpr uint64_t q_index[u_len - 1] = {0, 0, 2, 3, 2}; + GF2E b[u_len]; + b[0] = this->data; + for (size_t i = 1; i < u_len; ++i) { + GF2E b_p = b[i - 1]; + GF2E b_q = b[q_index[i - 1]]; + for (uint64_t m = u[q_index[i - 1]]; m; --m) { + b_p = b_p.sqr(); + } + b[i] = reduce_clmul(clmul(b_p.data, b_q.data)); + } + return b[u_len - 1].sqr(); + break; + } + case 4: { + constexpr uint64_t u[8] = {1, 2, 3, 5, 7, 14, 28, 31}; + constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]); + // q = u[i] - u[i - 1] should give us the corresponding values + // (1, 1, 2, 2, 7, 14, 3), which will have corresponding indexes + constexpr uint64_t q_index[u_len - 1] = {0, 0, 1, 1, 4, 5, 2}; + GF2E b[u_len]; + b[0] = this->data; + for (size_t i = 1; i < u_len; ++i) { + GF2E b_p = b[i - 1]; + GF2E b_q = b[q_index[i - 1]]; + for (uint64_t m = u[q_index[i - 1]]; m; --m) { + b_p = b_p.sqr(); + } + b[i] = reduce_clmul(clmul(b_p.data, b_q.data)); + } + return b[u_len - 1].sqr(); + break; + } + case 5: { + constexpr uint64_t u[8] = {1, 2, 3, 6, 12, 24, 27, 39}; + constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]); + // q = u[i] - u[i - 1] should give us the corresponding values + // (1, 1, 3, 6, 12, 3, 12), which will have corresponding indexes + constexpr uint64_t q_index[u_len - 1] = {0, 0, 2, 3, 4, 2, 4}; + GF2E b[u_len]; + b[0] = this->data; + for (size_t i = 1; i < u_len; ++i) { + GF2E b_p = b[i - 1]; + GF2E b_q = b[q_index[i - 1]]; + for (uint64_t m = u[q_index[i - 1]]; m; --m) { + b_p = b_p.sqr(); + } + b[i] = reduce_clmul(clmul(b_p.data, b_q.data)); + } + return b[u_len - 1].sqr(); + break; + } + case 6: { + constexpr uint64_t u[9] = {1, 2, 3, 5, 10, 20, 23, 46, 47}; + constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]); + // q = u[i] - u[i - 1] should give us the corresponding values + // (1, 1, 2, 5, 10, 3, 23, 1), which will have corresponding indexes + constexpr uint64_t q_index[u_len - 1] = {0, 0, 1, 3, 4, 2, 6, 0}; + GF2E b[u_len]; + b[0] = this->data; + for (size_t i = 1; i < u_len; ++i) { + GF2E b_p = b[i - 1]; + GF2E b_q = b[q_index[i - 1]]; + for (uint64_t m = u[q_index[i - 1]]; m; --m) { + b_p = b_p.sqr(); + } + b[i] = reduce_clmul(clmul(b_p.data, b_q.data)); + } + return b[u_len - 1].sqr(); + break; + } + default: + return GF2E(this->data); + break; + } +} + +// This is not faster than calling the operand because we are squaring numbers +// in GF2_X where X is less than 64 +GF2E GF2E::sqr() const { + + __m128i tmp[2]; + __m128i res; + + __m128i sqrT = _mm_set_epi64x(0x5554515045444140, 0x1514111005040100); + __m128i mask = _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0F); + + __m128i a = _mm_set_epi64x(0, this->data); + + tmp[0] = _mm_and_si128(a, mask); + + tmp[1] = _mm_srli_epi64(a, 4); + tmp[1] = _mm_and_si128(tmp[1], mask); + + tmp[0] = _mm_shuffle_epi8(sqrT, tmp[0]); + tmp[1] = _mm_shuffle_epi8(sqrT, tmp[1]); + + res = _mm_unpacklo_epi8(tmp[0], tmp[1]); + + return GF2E(reduce_clmul(res)); +} + void GF2E::to_bytes(uint8_t *out) const { uint64_t be_data = htole64(data); memcpy(out, (uint8_t *)(&be_data), byte_size); } + std::vector GF2E::to_bytes() const { std::vector buffer(byte_size); this->to_bytes(buffer.data()); return buffer; } +uint64_t GF2E::get_data() const { return this->data; } + void GF2E::from_bytes(uint8_t *in) { data = 0; memcpy((uint8_t *)(&data), in, byte_size); @@ -191,7 +452,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) { // modulus = x^16 + x^5 + x^3 + x + 1 modulus = (1ULL << 16) | (1ULL << 5) | (1ULL << 3) | (1ULL << 1) | (1ULL << 0); - reduce = reduce_GF2_16; + reduce_naive = reduce_GF2_16; + reduce_barret = reduce_GF2_16_barret; + reduce_clmul = reduce_GF2_16_clmul; byte_size = 2; // Ring morphism: // From: Finite Field in x of size 2^8 @@ -212,7 +475,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) { // modulus = x^32 + x^7 + x^3 + x^2 + 1 modulus = (1ULL << 32) | (1ULL << 7) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0); - reduce = reduce_GF2_32; + reduce_naive = reduce_GF2_32; + reduce_barret = reduce_GF2_32_barret; + reduce_clmul = reduce_GF2_32_clmul; byte_size = 4; // Ring morphism: // From: Finite Field in x of size 2^8 @@ -241,7 +506,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) { // modulus = x^40 + x^5 + x^4 + x^3 + 1 modulus = (1ULL << 40) | (1ULL << 5) | (1ULL << 4) | (1ULL << 3) | (1ULL << 0); - reduce = reduce_GF2_40; + reduce_naive = reduce_GF2_40; + reduce_barret = reduce_GF2_40_barret; + reduce_clmul = reduce_GF2_40_clmul; byte_size = 5; // Ring morphism: // From: Finite Field in x of size 2^8 @@ -269,7 +536,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) { // modulus = x^48 + x^5 + x^3 + x^2 + 1 modulus = (1ULL << 48) | (1ULL << 5) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0); - reduce = reduce_GF2_48; + reduce_naive = reduce_GF2_48; + reduce_barret = reduce_GF2_48_barret; + reduce_clmul = reduce_GF2_48_clmul; byte_size = 6; // Ring morphism: // From: Finite Field in x of size 2^8 @@ -313,20 +582,82 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) { const GF2E &lift_uint8_t(uint8_t value) { return lifting_lut[value]; } -std::vector get_first_n_field_elements(size_t n) { - std::vector result; - result.reserve(n); - GF2E x(2); - GF2E gen = x; - for (size_t i = 0; i < n; i++) { - result.push_back(gen); - gen = gen * x; +// Use to precompute the constants of the denominaotr.inverse() +std::vector precompute_denominator(const std::vector &x_values) { + // Check if value size is power of 2 + if (ceil(log2(x_values.size())) != floor(log2(x_values.size()))) { + throw std::runtime_error("invalid sizes for interpolation"); } - return result; + size_t values_size = x_values.size(); + std::vector precomputed_denominator; + precomputed_denominator.reserve(values_size); + GF2E denominator; + + for (size_t k = 0; k < values_size; ++k) { + denominator = GF2E(1); + for (size_t i = 0; i < values_size; ++i) { + if (i != k) { + denominator *= x_values[k] - x_values[i]; + } + } + precomputed_denominator.push_back(denominator.inverse()); + } + + return precomputed_denominator; +} + +void set_x_minus_xi_poly_size( + std::vector> &precomputed_x_minus_xi, size_t root_count) { + size_t split_count = 0; + for (size_t i = root_count; i > 1; i /= 2) { + split_count += i; + } + precomputed_x_minus_xi.reserve(split_count); +} + +// Use to precompute x - xi recurssively +void precompute_x_minus_xi_poly_splits( + const std::vector &x_values, + std::vector> &precomputed_x_minus_xi) { + + size_t len = x_values.size(); + if (len == 1) { + return; + } + size_t len_half = len / 2; + + // Gets the first half roots + std::vector x_first_half_roots; + x_first_half_roots.reserve(len / 2); + for (size_t i = 0; i < len_half; ++i) { + x_first_half_roots.push_back(x_values[i]); + } + // Generates poly from roots + std::vector x_first_half_poly = build_from_roots(x_first_half_roots); + // Save poly + precomputed_x_minus_xi.push_back(x_first_half_poly); + // Recurssion with the first half roots + precompute_x_minus_xi_poly_splits(x_first_half_roots, precomputed_x_minus_xi); + + // Gets the second half roots + std::vector x_second_half_roots; + x_second_half_roots.reserve(len / 2); + for (size_t i = len_half; i < len; ++i) { + x_second_half_roots.push_back(x_values[i]); + } + // Generates poly from roots + std::vector x_second_half_poly = build_from_roots(x_second_half_roots); + // Save poly + precomputed_x_minus_xi.push_back(x_second_half_poly); + // Recurssion with the second half roots + precompute_x_minus_xi_poly_splits(x_second_half_roots, + precomputed_x_minus_xi); } +// Computing the precomputable part of the plain langrange interpolation +// (not-optimized) std::vector> -precompute_lagrange_polynomials(const std::vector &x_values) { +precompute_lagrange_polynomials_slow(const std::vector &x_values) { size_t m = x_values.size(); std::vector> precomputed_lagrange_polynomials; precomputed_lagrange_polynomials.reserve(m); @@ -352,6 +683,28 @@ precompute_lagrange_polynomials(const std::vector &x_values) { return precomputed_lagrange_polynomials; } +// Computing the precomputable part of the plain langrange interpolation +// (optimized) (fast only with large root sizes) +std::vector> +precompute_lagrange_polynomials(const std::vector &x_values) { + + size_t m = x_values.size(); + std::vector> precomputed_lagrange_polynomials; + precomputed_lagrange_polynomials.reserve(m); + + std::vector x_minus_xi = build_from_roots(x_values); + for (size_t k = 0; k < m; ++k) { + + std::vector numerator = x_minus_xi / x_values[k]; + + precomputed_lagrange_polynomials.push_back( + eval(numerator, x_values[k]).inverse() * numerator); + } + + return precomputed_lagrange_polynomials; +} + +// Langrange interpolation with precomputation (slow) std::vector interpolate_with_precomputation( const std::vector> &precomputed_lagrange_polynomials, const std::vector &y_values) { @@ -367,6 +720,71 @@ std::vector interpolate_with_precomputation( return res; } +// Langrange interpolation with precomputation (fast) +std::vector interpolate_with_precomputation( + const std::vector &precomputed_denominator, + const std::vector &y_values, const size_t index) { + + std::vector a_precomputed_denominator; + a_precomputed_denominator.reserve(1); + a_precomputed_denominator.push_back(precomputed_denominator[index]); + + return y_values[index] * a_precomputed_denominator; +} + +// Langrange interpolation using recurssion (fast) +std::vector interpolate_with_recurrsion( + const std::vector &y_values, + const std::vector &precomputed_denominator, + const std::vector> &precomputed_x_minus_xi, + const size_t x_start_index, const size_t x_length, + const size_t x_minus_xi_first_index, const size_t x_minus_xi_length) { + + // For indexing x_values + const size_t x_len_half = x_length / 2; + const size_t x_end_index = x_start_index + x_length - 1; + const size_t x_second_half_start_index = x_start_index + x_len_half; + + // For indexing x_minus_xi values + const size_t x_minus_xi_half_length = x_minus_xi_length / 2; + const size_t x_minus_xi_second_index = + x_minus_xi_half_length + x_minus_xi_first_index; + + // The recurssion part !! + if (x_length != 2) { + + return (precomputed_x_minus_xi[x_minus_xi_second_index] * + interpolate_with_recurrsion(y_values, precomputed_denominator, + precomputed_x_minus_xi, x_start_index, + x_len_half, x_minus_xi_first_index + 1, + x_minus_xi_half_length - 1)) + + (precomputed_x_minus_xi[x_minus_xi_first_index] * + interpolate_with_recurrsion( + y_values, precomputed_denominator, precomputed_x_minus_xi, + x_second_half_start_index, x_len_half, + x_minus_xi_second_index + 1, x_minus_xi_half_length - 1)); + } + + return (precomputed_x_minus_xi[x_minus_xi_second_index] * + interpolate_with_precomputation(precomputed_denominator, y_values, + x_start_index)) + + (precomputed_x_minus_xi[x_minus_xi_first_index] * + interpolate_with_precomputation(precomputed_denominator, y_values, + x_end_index)); +} + +std::vector get_first_n_field_elements(size_t n) { + std::vector result; + result.reserve(n); + GF2E x(2); + GF2E gen = x; + for (size_t i = 0; i < n; i++) { + result.push_back(gen); + gen = gen * x; + } + return result; +} + std::vector build_from_roots(const std::vector &roots) { size_t len = roots.size(); @@ -437,7 +855,7 @@ field::GF2E dot_product(const std::vector &lhs, for (size_t i = 0; i < lhs.size(); i++) accum = _mm_xor_si128(accum, clmul(lhs[i].data, rhs[i].data)); - field::GF2E result(field::GF2E::reduce(accum)); + field::GF2E result(field::GF2E::reduce_clmul(accum)); return result; } @@ -464,5 +882,24 @@ std::vector operator*(const std::vector &lhs, for (size_t j = 0; j < rhs.size(); j++) result[i + j] += lhs[i] * rhs[j]; + return result; +} + +// polynomial division of x^n*c^n + x^n-1*c^n-1 + .... by x - a +std::vector operator/(const std::vector &lhs, + const field::GF2E &rhs) { + std::vector temp(lhs); + size_t end_index = temp.size() - 1; + + std::vector result; + result.reserve(end_index); + + for (size_t i = end_index; i; --i) { + temp[i - 1] -= temp[i] * rhs; + } + for (size_t i = 1; i <= end_index; ++i) { + result.push_back(temp[i]); + } + return result; } \ No newline at end of file diff --git a/field.h b/field.h index a489872..968653b 100644 --- a/field.h +++ b/field.h @@ -3,7 +3,10 @@ #include "banquet_instances.h" #include #include +#include #include +#include +#include extern "C" { #include #include @@ -22,7 +25,9 @@ class GF2E { uint64_t data; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" - static std::function reduce; + static uint64_t (*reduce_naive)(__m128i); + static uint64_t (*reduce_barret)(__m128i); + static uint64_t (*reduce_clmul)(__m128i); #pragma GCC diagnostic pop static size_t byte_size; static uint64_t modulus; @@ -47,6 +52,10 @@ class GF2E { GF2E inverse() const; + GF2E sqr() const; + + GF2E inverse_const_time() const; + void to_bytes(uint8_t *out) const; std::vector to_bytes() const; void from_bytes(uint8_t *in); @@ -54,13 +63,61 @@ class GF2E { friend GF2E(::dot_product)(const std::vector &lhs, const std::vector &rhs); + + GF2E(std::string hex_string) { + // check if hex_string start with 0x or 0X + if (hex_string.rfind("0x", 0) == 0 || hex_string.rfind("0X", 0) == 0) { + hex_string = hex_string.substr(2); + } else { + throw std::runtime_error("input needs to be a hex number"); + } + constexpr size_t num_hex_chars = 64 / 4; + if (hex_string.length() > num_hex_chars) + throw std::runtime_error("input hex is too large"); + // pad to 128 bit + hex_string.insert(hex_string.begin(), num_hex_chars - hex_string.length(), + '0'); + + data = std::stoull(hex_string.substr(0, 64 / 4), nullptr, 16); + } + + uint64_t get_data() const; }; +std::ostream &operator<<(std::ostream &os, const GF2E &ele); const GF2E &lift_uint8_t(uint8_t value); -std::vector get_first_n_field_elements(size_t n); +std::vector precompute_denominator(const std::vector &x_values); + +void set_x_minus_xi_poly_size( + std::vector> &precomputed_x_minus_xi, size_t root_count); + +void precompute_x_minus_xi_poly_splits( + const std::vector &x_values, + std::vector> &precomputed_x_minus_xi); + +std::vector> +precompute_lagrange_polynomials_slow(const std::vector &x_values); + std::vector> precompute_lagrange_polynomials(const std::vector &x_values); + +std::vector interpolate_with_precomputation( + const std::vector> &precomputed_lagrange_polynomials, + const std::vector &y_values); + +std::vector interpolate_with_precomputation( + const std::vector &precomputed_denominator, + const std::vector &y_values, const size_t index); + +std::vector interpolate_with_recurrsion( + const std::vector &y_values, + const std::vector &precomputed_denominator, + const std::vector> &precomputed_x_minus_xi, + const size_t x_start_index, const size_t x_length, + const size_t x_minus_xi_first_index, const size_t x_minus_xi_length); + +std::vector get_first_n_field_elements(size_t n); std::vector interpolate_with_precomputation( const std::vector> &precomputed_lagrange_polynomials, const std::vector &y_values); @@ -78,4 +135,6 @@ std::vector operator*(const std::vector &lhs, std::vector operator*(const field::GF2E &lhs, const std::vector &rhs); std::vector operator*(const std::vector &lhs, - const std::vector &rhs); \ No newline at end of file + const std::vector &rhs); +std::vector operator/(const std::vector &lhs, + const field::GF2E &rhs); \ No newline at end of file diff --git a/tests/field_test.cpp b/tests/field_test.cpp index 1d86ba5..3d134b0 100644 --- a/tests/field_test.cpp +++ b/tests/field_test.cpp @@ -67,7 +67,9 @@ TEST_CASE("Modular Arithmetic GF(2^32)", "[field]") { field::GF2E b_2(234130046); REQUIRE(a * b == ab); REQUIRE(a * a == a_2); + REQUIRE(a.sqr() == a_2); REQUIRE(b * b == b_2); + REQUIRE(b.sqr() == b_2); } TEST_CASE("Modular Arithmetic GF(2^40)", "[field]") { @@ -79,6 +81,7 @@ TEST_CASE("Modular Arithmetic GF(2^40)", "[field]") { field::GF2E b_2(153370336291ULL); REQUIRE(a * b == ab); REQUIRE(a * a == a_2); + REQUIRE(a.sqr() == a_2); REQUIRE(b * b == b_2); } @@ -91,6 +94,7 @@ TEST_CASE("Modular Arithmetic GF(2^48)", "[field]") { field::GF2E b_2(119412372920018ULL); REQUIRE(a * b == ab); REQUIRE(a * a == a_2); + REQUIRE(a.sqr() == a_2); REQUIRE(b * b == b_2); } @@ -229,15 +233,32 @@ TEST_CASE("NTL inverse == custom", "[field]") { field::GF2E b = a.inverse(); field::GF2E c = utils::ntl_to_custom(inv(utils::custom_to_ntl(a))); - std::cout << utils::custom_to_ntl(a) << ", " << utils::custom_to_ntl(b) - << ", " << utils::custom_to_ntl(c) << "\n"; - std::cout << utils::custom_to_ntl(a * b) << ", " - << utils::custom_to_ntl(a * c) << ", " - << utils::custom_to_ntl(a) * utils::custom_to_ntl(c) << "\n"; REQUIRE(b == c); REQUIRE(a * b == field::GF2E(1)); } } + +TEST_CASE("Constant time inverse == custom", "[field]") { + + // Checking for GF2_32 + field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1)); + field::GF2E b; + b.set_coeff(31); + REQUIRE(b.inverse_const_time() == b.inverse()); + + // Checking for GF2_40 + field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param3)); + field::GF2E c; + c.set_coeff(39); + REQUIRE(c.inverse_const_time() == c.inverse()); + + // Checking for GF2_48 + field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param4)); + field::GF2E d; + d.set_coeff(47); + REQUIRE(d.inverse_const_time() == d.inverse()); +} + TEST_CASE("NTL interpolation == custom", "[field]") { field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1)); utils::init_extension_field(banquet_instance_get(Banquet_L1_Param1)); @@ -255,7 +276,7 @@ TEST_CASE("NTL interpolation == custom", "[field]") { } std::vector> a_lag = - field::precompute_lagrange_polynomials(a); + field::precompute_lagrange_polynomials_slow(a); std::vector b_lag = utils::precompute_lagrange_polynomials(b); REQUIRE(a_lag.size() == b_lag.size()); @@ -265,4 +286,57 @@ TEST_CASE("NTL interpolation == custom", "[field]") { REQUIRE(a_lag[i][j] == utils::ntl_to_custom(b_lag[i][j])); } } +} + +TEST_CASE("optmized custom == custom interpolation", "[field]") { + field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1)); + const size_t ROOT_SIZE = 128; + + std::vector x = field::get_first_n_field_elements(ROOT_SIZE); + std::vector y = field::get_first_n_field_elements(ROOT_SIZE); + std::vector> a_lag = + field::precompute_lagrange_polynomials_slow(x); + std::vector result = + field::interpolate_with_precomputation(a_lag, y); + + std::vector x_opti = + field::get_first_n_field_elements(ROOT_SIZE); + std::vector y_opti = + field::get_first_n_field_elements(ROOT_SIZE); + std::vector> x_lag = + field::precompute_lagrange_polynomials(x_opti); + std::vector result_optim = + field::interpolate_with_precomputation(x_lag, y_opti); + + REQUIRE(result == result_optim); +} + +TEST_CASE("fast interpolation == optmized custom interpolation", "[field]") { + field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1)); + const size_t ROOT_SIZE = 128; + + std::vector x_opti = + field::get_first_n_field_elements(ROOT_SIZE); + std::vector y_opti = + field::get_first_n_field_elements(ROOT_SIZE); + std::vector> x_lag = + field::precompute_lagrange_polynomials(x_opti); + std::vector result_optim = + field::interpolate_with_precomputation(x_lag, y_opti); + + std::vector x_fast = + field::get_first_n_field_elements(ROOT_SIZE); + std::vector y_fast = + field::get_first_n_field_elements(ROOT_SIZE); + std::vector precomputed_denominator = + field::precompute_denominator(x_fast); + std::vector> precomputed_x_minus_xi; + field::set_x_minus_xi_poly_size(precomputed_x_minus_xi, x_fast.size()); + field::precompute_x_minus_xi_poly_splits(x_fast, precomputed_x_minus_xi); + + std::vector result_fast = field::interpolate_with_recurrsion( + y_fast, precomputed_denominator, precomputed_x_minus_xi, 0, x_fast.size(), + 0, precomputed_x_minus_xi.size()); + + REQUIRE(result_fast == result_optim); } \ No newline at end of file