diff --git a/banquet.cpp b/banquet.cpp
index bfa7821..f9b2b4d 100644
--- a/banquet.cpp
+++ b/banquet.cpp
@@ -481,13 +481,15 @@ banquet_signature_t banquet_sign(const banquet_instance_t &instance,
   // a vector of the first m2+1 field elements for interpolation
   std::vector<field::GF2E> x_values_for_interpolation_zero_to_m2 =
       field::get_first_n_field_elements(instance.m2 + 1);
-  std::vector<std::vector<field::GF2E>> precomputation_for_zero_to_m2 =
-      field::precompute_lagrange_polynomials(
-          x_values_for_interpolation_zero_to_m2);
+  // a vector of the first 2 * m2+1 field elements for interpolation
   std::vector<field::GF2E> x_values_for_interpolation_zero_to_2m2 =
       field::get_first_n_field_elements(2 * instance.m2 + 1);
+
+  std::vector<std::vector<field::GF2E>> precomputation_for_zero_to_m2 =
+      field::precompute_lagrange_polynomials_slow(
+          x_values_for_interpolation_zero_to_m2);
   std::vector<std::vector<field::GF2E>> precomputation_for_zero_to_2m2 =
-      field::precompute_lagrange_polynomials(
+      field::precompute_lagrange_polynomials_slow(
           x_values_for_interpolation_zero_to_2m2);
 
   std::vector<std::vector<std::vector<std::vector<field::GF2E>>>> s_prime(
diff --git a/field.cpp b/field.cpp
index ed860d9..0aee604 100644
--- a/field.cpp
+++ b/field.cpp
@@ -1,7 +1,13 @@
 #include "field.h"
 
+#include <algorithm>
 #include <array>
+#include <chrono>
+#include <cmath>
+#include <complex>
 #include <cstring>
+#include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <stdexcept>
 #include <vector>
@@ -32,6 +38,17 @@ inline __m128i clmul(uint64_t a, uint64_t b) {
   return _mm_clmulepi64_si128(_mm_set_epi64x(0, a), _mm_set_epi64x(0, b), 0);
 }
 
+// actually a bit slowerthan naive version below on some CPU version perhaps ??
+uint64_t reduce_GF2_16_barret(__m128i in) {
+  // modulus = x^16 + x^5 + x^3 + x + 1
+  constexpr uint64_t lower_mask = 0xFFFFULL;
+  constexpr uint64_t P =
+      (1ULL << 16) | (1ULL << 5) | (1ULL << 3) | (1ULL << 1) | (1ULL << 0);
+  uint64_t R = _mm_cvtsi128_si64(in);
+  uint64_t T1 = _mm_cvtsi128_si64(clmul(R >> 16, P));
+  uint64_t T2 = _mm_cvtsi128_si64(clmul(T1 >> 16, P));
+  return lower_mask & (R ^ T2);
+}
 uint64_t reduce_GF2_16(__m128i in) {
   // modulus = x^16 + x^5 + x^3 + x + 1
   constexpr uint64_t lower_mask = 0xFFFFULL;
@@ -44,17 +61,39 @@ uint64_t reduce_GF2_16(__m128i in) {
             (R_upper << 0);
   return lower_mask & R_lower;
 }
+// actually this is slower than the above two :(
+uint64_t reduce_GF2_16_clmul(const __m128i in) {
+  // modulus = x^16 + x^5 + x^3 + x + 1
+  __m128i p = _mm_set_epi64x(0x0, 0x2B);
+  __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFF0000);
+  __m128i t;
+
+  __m128i hi = _mm_srli_si128(in, 2); // extracting the in_hi
+  __m128i low =
+      _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // in_hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 4 + 16 -> Length after xor
+
+  hi = _mm_srli_si128(t, 2);                        // extracting the t_hi
+  low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // t_hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 16 -> Length after xor
+
+  return _mm_extract_epi64(t, 0);
+}
 
-// actually a bit slowerthan naive version below
-__attribute__((unused)) uint64_t reduce_GF2_32_barret(__m128i in) {
+// actually a bit slowerthan naive version below on some CPU version perhaps ??
+uint64_t reduce_GF2_32_barret(__m128i in) {
   // modulus = x^32 + x^7 + x^3 + x^2 + 1
+  constexpr uint64_t lower_mask = 0xFFFFFFFFULL;
   constexpr uint64_t P =
       (1ULL << 32) | (1ULL << 7) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0);
-  constexpr uint64_t mu = P;
   uint64_t R = _mm_cvtsi128_si64(in);
-  uint64_t T1 = _mm_cvtsi128_si64(clmul(R >> 32, mu));
+  uint64_t T1 = _mm_cvtsi128_si64(clmul(R >> 32, P));
   uint64_t T2 = _mm_cvtsi128_si64(clmul(T1 >> 32, P));
-  return 0xFFFFFFFFULL & (R ^ T2);
+  return lower_mask & (R ^ T2);
 }
 uint64_t reduce_GF2_32(__m128i in) {
   // modulus = x^32 + x^7 + x^3 + x^2 + 1
@@ -69,6 +108,45 @@ uint64_t reduce_GF2_32(__m128i in) {
   return lower_mask & R_lower;
 }
 
+// actually this is slower than the above two :(
+uint64_t reduce_GF2_32_clmul(const __m128i in) {
+  // modulus = x^32 + x^7 + x^3 + x^2 + 1
+  __m128i p = _mm_set_epi64x(0x0, 0x8d);
+  __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000);
+  __m128i t;
+
+  __m128i hi = _mm_srli_si128(in, 4); // extracting the in_hi
+  __m128i low =
+      _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // in_hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 4 + 32 -> Length after xor
+
+  hi = _mm_srli_si128(t, 4);                        // extracting the t_hi
+  low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // t_hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 16 -> Length after xor
+
+  return _mm_extract_epi64(t, 0);
+}
+
+// actually a bit slowerthan naive version below on some CPU version perhaps ??
+uint64_t reduce_GF2_40_barret(__m128i in) {
+  // modulus = x^40 + x^5 + x^4 + x^3 + 1
+  constexpr uint64_t upper_mask = 0xFFFFULL;
+  constexpr uint64_t lower_mask = 0xFFFFFFFFFFULL;
+  constexpr uint64_t P =
+      (1ULL << 40) | (1ULL << 5) | (1ULL << 4) | (1ULL << 3) | (1ULL << 0);
+  uint64_t R_lower = _mm_extract_epi64(in, 0);
+  uint64_t R_upper =
+      ((_mm_extract_epi64(in, 1) & upper_mask) << 24) | (R_lower >> 40);
+  uint64_t T1 = _mm_cvtsi128_si64(clmul(R_upper, P));
+  T1 = ((_mm_extract_epi64(in, 1) & upper_mask) << 24) | (T1 >> 40);
+  uint64_t T2 = _mm_cvtsi128_si64(clmul(T1, P));
+  return lower_mask & (R_lower ^ T2);
+}
+
 uint64_t reduce_GF2_40(__m128i in) {
   // modulus = x^40 + x^5 + x^4 + x^3 + 1
   constexpr uint64_t upper_mask = 0xFFFFULL;
@@ -84,6 +162,44 @@ uint64_t reduce_GF2_40(__m128i in) {
   return lower_mask & R_lower;
 }
 
+uint64_t reduce_GF2_40_clmul(const __m128i in) {
+  // modulus = x^40 + x^5 + x^4 + x^3 + 1
+  __m128i p = _mm_set_epi64x(0x0, 0x39);
+  __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFF0000000000);
+  __m128i t;
+
+  __m128i hi = _mm_srli_si128(in, 5); // extracting the hi
+  __m128i low =
+      _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 4 + 40 -> Length after xor
+
+  hi = _mm_srli_si128(t, 5);                        // extracting the hi
+  low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 40 -> Length after xor
+
+  return _mm_extract_epi64(t, 0);
+}
+
+// actually a bit slowerthan naive version below on some CPU version perhaps ??
+uint64_t reduce_GF2_48_barret(__m128i in) {
+  // modulus = x^48 + x^5 + x^3 + x^2 + 1
+  constexpr uint64_t upper_mask = 0xFFFFFFFFULL;
+  constexpr uint64_t lower_mask = 0xFFFFFFFFFFFFULL;
+  constexpr uint64_t P =
+      (1ULL << 48) | (1ULL << 5) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0);
+  uint64_t R_lower = _mm_extract_epi64(in, 0);
+  uint64_t R_upper =
+      ((_mm_extract_epi64(in, 1) & upper_mask) << 16) | (R_lower >> 48);
+  uint64_t T1 = _mm_cvtsi128_si64(clmul(R_upper, P));
+  T1 = ((_mm_extract_epi64(in, 1) & upper_mask) << 16) | (T1 >> 48);
+  uint64_t T2 = _mm_cvtsi128_si64(clmul(T1, P));
+  return lower_mask & (R_lower ^ T2);
+}
+
 uint64_t reduce_GF2_48(__m128i in) {
   // modulus = x^48 + x^5 + x^3 + x^2 + 1
   constexpr uint64_t upper_mask = 0xFFFFFFFFULL;
@@ -98,6 +214,29 @@ uint64_t reduce_GF2_48(__m128i in) {
   return lower_mask & R_lower;
 }
 
+// actually this is slower than the above two :(
+uint64_t reduce_GF2_48_clmul(const __m128i in) {
+  // modulus = x^48 + x^5 + x^3 + x^2 + 1
+  __m128i p = _mm_set_epi64x(0x0, 0x2d);
+  __m128i mask = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF000000000000);
+  __m128i t;
+
+  __m128i hi = _mm_srli_si128(in, 6); // extracting the hi
+  __m128i low =
+      _mm_xor_si128(_mm_or_si128(in, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 4 + 48 -> Length after xor
+
+  hi = _mm_srli_si128(t, 6);                        // extracting the hi
+  low = _mm_xor_si128(_mm_or_si128(t, mask), mask); // extracting the low
+
+  t = _mm_clmulepi64_si128(hi, p, 0x00); // hi_low(0x00) * p
+  t = _mm_xor_si128(t, low);             // 48 -> Length after xor
+
+  return _mm_extract_epi64(t, 0);
+}
+
 uint64_t GF2_euclidean_div_quotient(uint64_t a, uint64_t b) {
   uint64_t quotient = 0;
   int diff = __builtin_clzl(b) - __builtin_clzl(a);
@@ -134,7 +273,9 @@ uint64_t mod_inverse(uint64_t a, uint64_t mod) {
 namespace field {
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-attributes"
-std::function<uint64_t(__m128i)> GF2E::reduce = nullptr;
+uint64_t (*GF2E::reduce_naive)(__m128i);
+uint64_t (*GF2E::reduce_barret)(__m128i);
+uint64_t (*GF2E::reduce_clmul)(__m128i);
 #pragma GCC diagnostic pop
 size_t GF2E::byte_size = 0;
 uint64_t GF2E::modulus = 0;
@@ -154,10 +295,10 @@ GF2E &GF2E::operator-=(const GF2E &other) {
   return *this;
 }
 GF2E GF2E::operator*(const GF2E &other) const {
-  return GF2E(reduce(clmul(this->data, other.data)));
+  return GF2E(reduce_clmul(clmul(this->data, other.data)));
 }
 GF2E &GF2E::operator*=(const GF2E &other) {
-  this->data = reduce(clmul(this->data, other.data));
+  this->data = reduce_clmul(clmul(this->data, other.data));
   return *this;
 }
 bool GF2E::operator==(const GF2E &other) const {
@@ -167,18 +308,138 @@ bool GF2E::operator!=(const GF2E &other) const {
   return this->data != other.data;
 }
 
+std::ostream &operator<<(std::ostream &os, const GF2E &ele) {
+  os << "0x" << std::setfill('0') << std::hex << std::setw(8) << ele.get_data();
+  return os;
+}
+
 GF2E GF2E::inverse() const { return GF2E(mod_inverse(this->data, modulus)); }
 
+// This is actually slow than the other inverse inplementation but this has
+// constant time, so gets better with larger root size and field size
+GF2E GF2E::inverse_const_time() const {
+
+  switch (this->byte_size) {
+  case 2: {
+    constexpr uint64_t u[6] = {1, 2, 3, 6, 12, 15};
+    constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]);
+    // q = u[i] - u[i - 1] should give us the corresponding values
+    // (1, 1, 3, 6, 3), which will have corresponding indexes
+    constexpr uint64_t q_index[u_len - 1] = {0, 0, 2, 3, 2};
+    GF2E b[u_len];
+    b[0] = this->data;
+    for (size_t i = 1; i < u_len; ++i) {
+      GF2E b_p = b[i - 1];
+      GF2E b_q = b[q_index[i - 1]];
+      for (uint64_t m = u[q_index[i - 1]]; m; --m) {
+        b_p = b_p.sqr();
+      }
+      b[i] = reduce_clmul(clmul(b_p.data, b_q.data));
+    }
+    return b[u_len - 1].sqr();
+    break;
+  }
+  case 4: {
+    constexpr uint64_t u[8] = {1, 2, 3, 5, 7, 14, 28, 31};
+    constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]);
+    // q = u[i] - u[i - 1] should give us the corresponding values
+    // (1, 1, 2, 2, 7, 14, 3), which will have corresponding indexes
+    constexpr uint64_t q_index[u_len - 1] = {0, 0, 1, 1, 4, 5, 2};
+    GF2E b[u_len];
+    b[0] = this->data;
+    for (size_t i = 1; i < u_len; ++i) {
+      GF2E b_p = b[i - 1];
+      GF2E b_q = b[q_index[i - 1]];
+      for (uint64_t m = u[q_index[i - 1]]; m; --m) {
+        b_p = b_p.sqr();
+      }
+      b[i] = reduce_clmul(clmul(b_p.data, b_q.data));
+    }
+    return b[u_len - 1].sqr();
+    break;
+  }
+  case 5: {
+    constexpr uint64_t u[8] = {1, 2, 3, 6, 12, 24, 27, 39};
+    constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]);
+    // q = u[i] - u[i - 1] should give us the corresponding values
+    // (1, 1, 3, 6, 12, 3, 12), which will have corresponding indexes
+    constexpr uint64_t q_index[u_len - 1] = {0, 0, 2, 3, 4, 2, 4};
+    GF2E b[u_len];
+    b[0] = this->data;
+    for (size_t i = 1; i < u_len; ++i) {
+      GF2E b_p = b[i - 1];
+      GF2E b_q = b[q_index[i - 1]];
+      for (uint64_t m = u[q_index[i - 1]]; m; --m) {
+        b_p = b_p.sqr();
+      }
+      b[i] = reduce_clmul(clmul(b_p.data, b_q.data));
+    }
+    return b[u_len - 1].sqr();
+    break;
+  }
+  case 6: {
+    constexpr uint64_t u[9] = {1, 2, 3, 5, 10, 20, 23, 46, 47};
+    constexpr uint64_t u_len = sizeof(u) / sizeof(u[0]);
+    // q = u[i] - u[i - 1] should give us the corresponding values
+    // (1, 1, 2, 5, 10, 3, 23, 1), which will have corresponding indexes
+    constexpr uint64_t q_index[u_len - 1] = {0, 0, 1, 3, 4, 2, 6, 0};
+    GF2E b[u_len];
+    b[0] = this->data;
+    for (size_t i = 1; i < u_len; ++i) {
+      GF2E b_p = b[i - 1];
+      GF2E b_q = b[q_index[i - 1]];
+      for (uint64_t m = u[q_index[i - 1]]; m; --m) {
+        b_p = b_p.sqr();
+      }
+      b[i] = reduce_clmul(clmul(b_p.data, b_q.data));
+    }
+    return b[u_len - 1].sqr();
+    break;
+  }
+  default:
+    return GF2E(this->data);
+    break;
+  }
+}
+
+// This is not faster than calling the operand because we are squaring numbers
+// in GF2_X where X is less than 64
+GF2E GF2E::sqr() const {
+
+  __m128i tmp[2];
+  __m128i res;
+
+  __m128i sqrT = _mm_set_epi64x(0x5554515045444140, 0x1514111005040100);
+  __m128i mask = _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0F);
+
+  __m128i a = _mm_set_epi64x(0, this->data);
+
+  tmp[0] = _mm_and_si128(a, mask);
+
+  tmp[1] = _mm_srli_epi64(a, 4);
+  tmp[1] = _mm_and_si128(tmp[1], mask);
+
+  tmp[0] = _mm_shuffle_epi8(sqrT, tmp[0]);
+  tmp[1] = _mm_shuffle_epi8(sqrT, tmp[1]);
+
+  res = _mm_unpacklo_epi8(tmp[0], tmp[1]);
+
+  return GF2E(reduce_clmul(res));
+}
+
 void GF2E::to_bytes(uint8_t *out) const {
   uint64_t be_data = htole64(data);
   memcpy(out, (uint8_t *)(&be_data), byte_size);
 }
+
 std::vector<uint8_t> GF2E::to_bytes() const {
   std::vector<uint8_t> buffer(byte_size);
   this->to_bytes(buffer.data());
   return buffer;
 }
 
+uint64_t GF2E::get_data() const { return this->data; }
+
 void GF2E::from_bytes(uint8_t *in) {
   data = 0;
   memcpy((uint8_t *)(&data), in, byte_size);
@@ -191,7 +452,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) {
     // modulus = x^16 + x^5 + x^3 + x + 1
     modulus =
         (1ULL << 16) | (1ULL << 5) | (1ULL << 3) | (1ULL << 1) | (1ULL << 0);
-    reduce = reduce_GF2_16;
+    reduce_naive = reduce_GF2_16;
+    reduce_barret = reduce_GF2_16_barret;
+    reduce_clmul = reduce_GF2_16_clmul;
     byte_size = 2;
     // Ring morphism:
     //   From: Finite Field in x of size 2^8
@@ -212,7 +475,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) {
     // modulus = x^32 + x^7 + x^3 + x^2 + 1
     modulus =
         (1ULL << 32) | (1ULL << 7) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0);
-    reduce = reduce_GF2_32;
+    reduce_naive = reduce_GF2_32;
+    reduce_barret = reduce_GF2_32_barret;
+    reduce_clmul = reduce_GF2_32_clmul;
     byte_size = 4;
     // Ring morphism:
     //   From: Finite Field in x of size 2^8
@@ -241,7 +506,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) {
     // modulus = x^40 + x^5 + x^4 + x^3 + 1
     modulus =
         (1ULL << 40) | (1ULL << 5) | (1ULL << 4) | (1ULL << 3) | (1ULL << 0);
-    reduce = reduce_GF2_40;
+    reduce_naive = reduce_GF2_40;
+    reduce_barret = reduce_GF2_40_barret;
+    reduce_clmul = reduce_GF2_40_clmul;
     byte_size = 5;
     // Ring morphism:
     //   From: Finite Field in x of size 2^8
@@ -269,7 +536,9 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) {
     // modulus = x^48 + x^5 + x^3 + x^2 + 1
     modulus =
         (1ULL << 48) | (1ULL << 5) | (1ULL << 3) | (1ULL << 2) | (1ULL << 0);
-    reduce = reduce_GF2_48;
+    reduce_naive = reduce_GF2_48;
+    reduce_barret = reduce_GF2_48_barret;
+    reduce_clmul = reduce_GF2_48_clmul;
     byte_size = 6;
     // Ring morphism:
     //   From: Finite Field in x of size 2^8
@@ -313,20 +582,82 @@ void GF2E::init_extension_field(const banquet_instance_t &instance) {
 
 const GF2E &lift_uint8_t(uint8_t value) { return lifting_lut[value]; }
 
-std::vector<GF2E> get_first_n_field_elements(size_t n) {
-  std::vector<GF2E> result;
-  result.reserve(n);
-  GF2E x(2);
-  GF2E gen = x;
-  for (size_t i = 0; i < n; i++) {
-    result.push_back(gen);
-    gen = gen * x;
+// Use to precompute the constants of the denominaotr.inverse()
+std::vector<GF2E> precompute_denominator(const std::vector<GF2E> &x_values) {
+  // Check if value size is power of 2
+  if (ceil(log2(x_values.size())) != floor(log2(x_values.size()))) {
+    throw std::runtime_error("invalid sizes for interpolation");
   }
-  return result;
+  size_t values_size = x_values.size();
+  std::vector<GF2E> precomputed_denominator;
+  precomputed_denominator.reserve(values_size);
+  GF2E denominator;
+
+  for (size_t k = 0; k < values_size; ++k) {
+    denominator = GF2E(1);
+    for (size_t i = 0; i < values_size; ++i) {
+      if (i != k) {
+        denominator *= x_values[k] - x_values[i];
+      }
+    }
+    precomputed_denominator.push_back(denominator.inverse());
+  }
+
+  return precomputed_denominator;
+}
+
+void set_x_minus_xi_poly_size(
+    std::vector<std::vector<GF2E>> &precomputed_x_minus_xi, size_t root_count) {
+  size_t split_count = 0;
+  for (size_t i = root_count; i > 1; i /= 2) {
+    split_count += i;
+  }
+  precomputed_x_minus_xi.reserve(split_count);
+}
+
+// Use to precompute x - xi recurssively
+void precompute_x_minus_xi_poly_splits(
+    const std::vector<GF2E> &x_values,
+    std::vector<std::vector<GF2E>> &precomputed_x_minus_xi) {
+
+  size_t len = x_values.size();
+  if (len == 1) {
+    return;
+  }
+  size_t len_half = len / 2;
+
+  // Gets the first half roots
+  std::vector<GF2E> x_first_half_roots;
+  x_first_half_roots.reserve(len / 2);
+  for (size_t i = 0; i < len_half; ++i) {
+    x_first_half_roots.push_back(x_values[i]);
+  }
+  // Generates poly from roots
+  std::vector<GF2E> x_first_half_poly = build_from_roots(x_first_half_roots);
+  // Save poly
+  precomputed_x_minus_xi.push_back(x_first_half_poly);
+  // Recurssion with the first half roots
+  precompute_x_minus_xi_poly_splits(x_first_half_roots, precomputed_x_minus_xi);
+
+  // Gets the second half roots
+  std::vector<GF2E> x_second_half_roots;
+  x_second_half_roots.reserve(len / 2);
+  for (size_t i = len_half; i < len; ++i) {
+    x_second_half_roots.push_back(x_values[i]);
+  }
+  // Generates poly from roots
+  std::vector<GF2E> x_second_half_poly = build_from_roots(x_second_half_roots);
+  // Save poly
+  precomputed_x_minus_xi.push_back(x_second_half_poly);
+  // Recurssion with the second half roots
+  precompute_x_minus_xi_poly_splits(x_second_half_roots,
+                                    precomputed_x_minus_xi);
 }
 
+// Computing the precomputable part of the plain langrange interpolation
+// (not-optimized)
 std::vector<std::vector<GF2E>>
-precompute_lagrange_polynomials(const std::vector<GF2E> &x_values) {
+precompute_lagrange_polynomials_slow(const std::vector<GF2E> &x_values) {
   size_t m = x_values.size();
   std::vector<std::vector<GF2E>> precomputed_lagrange_polynomials;
   precomputed_lagrange_polynomials.reserve(m);
@@ -352,6 +683,28 @@ precompute_lagrange_polynomials(const std::vector<GF2E> &x_values) {
   return precomputed_lagrange_polynomials;
 }
 
+// Computing the precomputable part of the plain langrange interpolation
+// (optimized) (fast only with large root sizes)
+std::vector<std::vector<GF2E>>
+precompute_lagrange_polynomials(const std::vector<GF2E> &x_values) {
+
+  size_t m = x_values.size();
+  std::vector<std::vector<GF2E>> precomputed_lagrange_polynomials;
+  precomputed_lagrange_polynomials.reserve(m);
+
+  std::vector<GF2E> x_minus_xi = build_from_roots(x_values);
+  for (size_t k = 0; k < m; ++k) {
+
+    std::vector<GF2E> numerator = x_minus_xi / x_values[k];
+
+    precomputed_lagrange_polynomials.push_back(
+        eval(numerator, x_values[k]).inverse() * numerator);
+  }
+
+  return precomputed_lagrange_polynomials;
+}
+
+// Langrange interpolation with precomputation (slow)
 std::vector<GF2E> interpolate_with_precomputation(
     const std::vector<std::vector<GF2E>> &precomputed_lagrange_polynomials,
     const std::vector<GF2E> &y_values) {
@@ -367,6 +720,71 @@ std::vector<GF2E> interpolate_with_precomputation(
   return res;
 }
 
+// Langrange interpolation with precomputation (fast)
+std::vector<GF2E> interpolate_with_precomputation(
+    const std::vector<GF2E> &precomputed_denominator,
+    const std::vector<GF2E> &y_values, const size_t index) {
+
+  std::vector<GF2E> a_precomputed_denominator;
+  a_precomputed_denominator.reserve(1);
+  a_precomputed_denominator.push_back(precomputed_denominator[index]);
+
+  return y_values[index] * a_precomputed_denominator;
+}
+
+// Langrange interpolation using recurssion (fast)
+std::vector<GF2E> interpolate_with_recurrsion(
+    const std::vector<GF2E> &y_values,
+    const std::vector<GF2E> &precomputed_denominator,
+    const std::vector<std::vector<GF2E>> &precomputed_x_minus_xi,
+    const size_t x_start_index, const size_t x_length,
+    const size_t x_minus_xi_first_index, const size_t x_minus_xi_length) {
+
+  // For indexing x_values
+  const size_t x_len_half = x_length / 2;
+  const size_t x_end_index = x_start_index + x_length - 1;
+  const size_t x_second_half_start_index = x_start_index + x_len_half;
+
+  // For indexing x_minus_xi values
+  const size_t x_minus_xi_half_length = x_minus_xi_length / 2;
+  const size_t x_minus_xi_second_index =
+      x_minus_xi_half_length + x_minus_xi_first_index;
+
+  // The recurssion part !!
+  if (x_length != 2) {
+
+    return (precomputed_x_minus_xi[x_minus_xi_second_index] *
+            interpolate_with_recurrsion(y_values, precomputed_denominator,
+                                        precomputed_x_minus_xi, x_start_index,
+                                        x_len_half, x_minus_xi_first_index + 1,
+                                        x_minus_xi_half_length - 1)) +
+           (precomputed_x_minus_xi[x_minus_xi_first_index] *
+            interpolate_with_recurrsion(
+                y_values, precomputed_denominator, precomputed_x_minus_xi,
+                x_second_half_start_index, x_len_half,
+                x_minus_xi_second_index + 1, x_minus_xi_half_length - 1));
+  }
+
+  return (precomputed_x_minus_xi[x_minus_xi_second_index] *
+          interpolate_with_precomputation(precomputed_denominator, y_values,
+                                          x_start_index)) +
+         (precomputed_x_minus_xi[x_minus_xi_first_index] *
+          interpolate_with_precomputation(precomputed_denominator, y_values,
+                                          x_end_index));
+}
+
+std::vector<GF2E> get_first_n_field_elements(size_t n) {
+  std::vector<GF2E> result;
+  result.reserve(n);
+  GF2E x(2);
+  GF2E gen = x;
+  for (size_t i = 0; i < n; i++) {
+    result.push_back(gen);
+    gen = gen * x;
+  }
+  return result;
+}
+
 std::vector<GF2E> build_from_roots(const std::vector<GF2E> &roots) {
   size_t len = roots.size();
 
@@ -437,7 +855,7 @@ field::GF2E dot_product(const std::vector<field::GF2E> &lhs,
   for (size_t i = 0; i < lhs.size(); i++)
     accum = _mm_xor_si128(accum, clmul(lhs[i].data, rhs[i].data));
 
-  field::GF2E result(field::GF2E::reduce(accum));
+  field::GF2E result(field::GF2E::reduce_clmul(accum));
   return result;
 }
 
@@ -464,5 +882,24 @@ std::vector<field::GF2E> operator*(const std::vector<field::GF2E> &lhs,
     for (size_t j = 0; j < rhs.size(); j++)
       result[i + j] += lhs[i] * rhs[j];
 
+  return result;
+}
+
+// polynomial division of x^n*c^n + x^n-1*c^n-1 + .... by x - a
+std::vector<field::GF2E> operator/(const std::vector<field::GF2E> &lhs,
+                                   const field::GF2E &rhs) {
+  std::vector<field::GF2E> temp(lhs);
+  size_t end_index = temp.size() - 1;
+
+  std::vector<field::GF2E> result;
+  result.reserve(end_index);
+
+  for (size_t i = end_index; i; --i) {
+    temp[i - 1] -= temp[i] * rhs;
+  }
+  for (size_t i = 1; i <= end_index; ++i) {
+    result.push_back(temp[i]);
+  }
+
   return result;
 }
\ No newline at end of file
diff --git a/field.h b/field.h
index a489872..968653b 100644
--- a/field.h
+++ b/field.h
@@ -3,7 +3,10 @@
 #include "banquet_instances.h"
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <functional>
+#include <iomanip>
+#include <iostream>
 extern "C" {
 #include <smmintrin.h>
 #include <wmmintrin.h>
@@ -22,7 +25,9 @@ class GF2E {
   uint64_t data;
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-attributes"
-  static std::function<uint64_t(__m128i)> reduce;
+  static uint64_t (*reduce_naive)(__m128i);
+  static uint64_t (*reduce_barret)(__m128i);
+  static uint64_t (*reduce_clmul)(__m128i);
 #pragma GCC diagnostic pop
   static size_t byte_size;
   static uint64_t modulus;
@@ -47,6 +52,10 @@ class GF2E {
 
   GF2E inverse() const;
 
+  GF2E sqr() const;
+
+  GF2E inverse_const_time() const;
+
   void to_bytes(uint8_t *out) const;
   std::vector<uint8_t> to_bytes() const;
   void from_bytes(uint8_t *in);
@@ -54,13 +63,61 @@ class GF2E {
 
   friend GF2E(::dot_product)(const std::vector<field::GF2E> &lhs,
                              const std::vector<field::GF2E> &rhs);
+
+  GF2E(std::string hex_string) {
+    // check if hex_string start with 0x or 0X
+    if (hex_string.rfind("0x", 0) == 0 || hex_string.rfind("0X", 0) == 0) {
+      hex_string = hex_string.substr(2);
+    } else {
+      throw std::runtime_error("input needs to be a hex number");
+    }
+    constexpr size_t num_hex_chars = 64 / 4;
+    if (hex_string.length() > num_hex_chars)
+      throw std::runtime_error("input hex is too large");
+    // pad to 128 bit
+    hex_string.insert(hex_string.begin(), num_hex_chars - hex_string.length(),
+                      '0');
+
+    data = std::stoull(hex_string.substr(0, 64 / 4), nullptr, 16);
+  }
+
+  uint64_t get_data() const;
 };
 
+std::ostream &operator<<(std::ostream &os, const GF2E &ele);
 const GF2E &lift_uint8_t(uint8_t value);
 
-std::vector<GF2E> get_first_n_field_elements(size_t n);
+std::vector<GF2E> precompute_denominator(const std::vector<GF2E> &x_values);
+
+void set_x_minus_xi_poly_size(
+    std::vector<std::vector<GF2E>> &precomputed_x_minus_xi, size_t root_count);
+
+void precompute_x_minus_xi_poly_splits(
+    const std::vector<GF2E> &x_values,
+    std::vector<std::vector<GF2E>> &precomputed_x_minus_xi);
+
+std::vector<std::vector<GF2E>>
+precompute_lagrange_polynomials_slow(const std::vector<GF2E> &x_values);
+
 std::vector<std::vector<GF2E>>
 precompute_lagrange_polynomials(const std::vector<GF2E> &x_values);
+
+std::vector<GF2E> interpolate_with_precomputation(
+    const std::vector<std::vector<GF2E>> &precomputed_lagrange_polynomials,
+    const std::vector<GF2E> &y_values);
+
+std::vector<GF2E> interpolate_with_precomputation(
+    const std::vector<GF2E> &precomputed_denominator,
+    const std::vector<GF2E> &y_values, const size_t index);
+
+std::vector<GF2E> interpolate_with_recurrsion(
+    const std::vector<GF2E> &y_values,
+    const std::vector<GF2E> &precomputed_denominator,
+    const std::vector<std::vector<GF2E>> &precomputed_x_minus_xi,
+    const size_t x_start_index, const size_t x_length,
+    const size_t x_minus_xi_first_index, const size_t x_minus_xi_length);
+
+std::vector<GF2E> get_first_n_field_elements(size_t n);
 std::vector<GF2E> interpolate_with_precomputation(
     const std::vector<std::vector<GF2E>> &precomputed_lagrange_polynomials,
     const std::vector<GF2E> &y_values);
@@ -78,4 +135,6 @@ std::vector<field::GF2E> operator*(const std::vector<field::GF2E> &lhs,
 std::vector<field::GF2E> operator*(const field::GF2E &lhs,
                                    const std::vector<field::GF2E> &rhs);
 std::vector<field::GF2E> operator*(const std::vector<field::GF2E> &lhs,
-                                   const std::vector<field::GF2E> &rhs);
\ No newline at end of file
+                                   const std::vector<field::GF2E> &rhs);
+std::vector<field::GF2E> operator/(const std::vector<field::GF2E> &lhs,
+                                   const field::GF2E &rhs);
\ No newline at end of file
diff --git a/tests/field_test.cpp b/tests/field_test.cpp
index 1d86ba5..3d134b0 100644
--- a/tests/field_test.cpp
+++ b/tests/field_test.cpp
@@ -67,7 +67,9 @@ TEST_CASE("Modular Arithmetic GF(2^32)", "[field]") {
   field::GF2E b_2(234130046);
   REQUIRE(a * b == ab);
   REQUIRE(a * a == a_2);
+  REQUIRE(a.sqr() == a_2);
   REQUIRE(b * b == b_2);
+  REQUIRE(b.sqr() == b_2);
 }
 
 TEST_CASE("Modular Arithmetic GF(2^40)", "[field]") {
@@ -79,6 +81,7 @@ TEST_CASE("Modular Arithmetic GF(2^40)", "[field]") {
   field::GF2E b_2(153370336291ULL);
   REQUIRE(a * b == ab);
   REQUIRE(a * a == a_2);
+  REQUIRE(a.sqr() == a_2);
   REQUIRE(b * b == b_2);
 }
 
@@ -91,6 +94,7 @@ TEST_CASE("Modular Arithmetic GF(2^48)", "[field]") {
   field::GF2E b_2(119412372920018ULL);
   REQUIRE(a * b == ab);
   REQUIRE(a * a == a_2);
+  REQUIRE(a.sqr() == a_2);
   REQUIRE(b * b == b_2);
 }
 
@@ -229,15 +233,32 @@ TEST_CASE("NTL inverse == custom", "[field]") {
 
     field::GF2E b = a.inverse();
     field::GF2E c = utils::ntl_to_custom(inv(utils::custom_to_ntl(a)));
-    std::cout << utils::custom_to_ntl(a) << ", " << utils::custom_to_ntl(b)
-              << ", " << utils::custom_to_ntl(c) << "\n";
-    std::cout << utils::custom_to_ntl(a * b) << ", "
-              << utils::custom_to_ntl(a * c) << ", "
-              << utils::custom_to_ntl(a) * utils::custom_to_ntl(c) << "\n";
     REQUIRE(b == c);
     REQUIRE(a * b == field::GF2E(1));
   }
 }
+
+TEST_CASE("Constant time inverse == custom", "[field]") {
+
+  // Checking for GF2_32
+  field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1));
+  field::GF2E b;
+  b.set_coeff(31);
+  REQUIRE(b.inverse_const_time() == b.inverse());
+
+  // Checking for GF2_40
+  field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param3));
+  field::GF2E c;
+  c.set_coeff(39);
+  REQUIRE(c.inverse_const_time() == c.inverse());
+
+  // Checking for GF2_48
+  field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param4));
+  field::GF2E d;
+  d.set_coeff(47);
+  REQUIRE(d.inverse_const_time() == d.inverse());
+}
+
 TEST_CASE("NTL interpolation == custom", "[field]") {
   field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1));
   utils::init_extension_field(banquet_instance_get(Banquet_L1_Param1));
@@ -255,7 +276,7 @@ TEST_CASE("NTL interpolation == custom", "[field]") {
   }
 
   std::vector<std::vector<field::GF2E>> a_lag =
-      field::precompute_lagrange_polynomials(a);
+      field::precompute_lagrange_polynomials_slow(a);
   std::vector<GF2EX> b_lag = utils::precompute_lagrange_polynomials(b);
 
   REQUIRE(a_lag.size() == b_lag.size());
@@ -265,4 +286,57 @@ TEST_CASE("NTL interpolation == custom", "[field]") {
       REQUIRE(a_lag[i][j] == utils::ntl_to_custom(b_lag[i][j]));
     }
   }
+}
+
+TEST_CASE("optmized custom == custom interpolation", "[field]") {
+  field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1));
+  const size_t ROOT_SIZE = 128;
+
+  std::vector<field::GF2E> x = field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<field::GF2E> y = field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<std::vector<field::GF2E>> a_lag =
+      field::precompute_lagrange_polynomials_slow(x);
+  std::vector<field::GF2E> result =
+      field::interpolate_with_precomputation(a_lag, y);
+
+  std::vector<field::GF2E> x_opti =
+      field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<field::GF2E> y_opti =
+      field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<std::vector<field::GF2E>> x_lag =
+      field::precompute_lagrange_polynomials(x_opti);
+  std::vector<field::GF2E> result_optim =
+      field::interpolate_with_precomputation(x_lag, y_opti);
+
+  REQUIRE(result == result_optim);
+}
+
+TEST_CASE("fast interpolation == optmized custom interpolation", "[field]") {
+  field::GF2E::init_extension_field(banquet_instance_get(Banquet_L1_Param1));
+  const size_t ROOT_SIZE = 128;
+
+  std::vector<field::GF2E> x_opti =
+      field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<field::GF2E> y_opti =
+      field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<std::vector<field::GF2E>> x_lag =
+      field::precompute_lagrange_polynomials(x_opti);
+  std::vector<field::GF2E> result_optim =
+      field::interpolate_with_precomputation(x_lag, y_opti);
+
+  std::vector<field::GF2E> x_fast =
+      field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<field::GF2E> y_fast =
+      field::get_first_n_field_elements(ROOT_SIZE);
+  std::vector<field::GF2E> precomputed_denominator =
+      field::precompute_denominator(x_fast);
+  std::vector<std::vector<field::GF2E>> precomputed_x_minus_xi;
+  field::set_x_minus_xi_poly_size(precomputed_x_minus_xi, x_fast.size());
+  field::precompute_x_minus_xi_poly_splits(x_fast, precomputed_x_minus_xi);
+
+  std::vector<field::GF2E> result_fast = field::interpolate_with_recurrsion(
+      y_fast, precomputed_denominator, precomputed_x_minus_xi, 0, x_fast.size(),
+      0, precomputed_x_minus_xi.size());
+
+  REQUIRE(result_fast == result_optim);
 }
\ No newline at end of file