diff --git a/src/solvers/Makefile b/src/solvers/Makefile
index 71c7a01b3ed..d8c790ae8db 100644
--- a/src/solvers/Makefile
+++ b/src/solvers/Makefile
@@ -118,6 +118,7 @@ SRC = $(BOOLEFORCE_SRC) \
       flattening/boolbv_onehot.cpp \
       flattening/boolbv_overflow.cpp \
       flattening/boolbv_power.cpp \
+      flattening/boolbv_popcount.cpp \
       flattening/boolbv_quantifier.cpp \
       flattening/boolbv_reduction.cpp \
       flattening/boolbv_replication.cpp \
diff --git a/src/solvers/flattening/boolbv.cpp b/src/solvers/flattening/boolbv.cpp
index b94cd6db0c3..24077af0a65 100644
--- a/src/solvers/flattening/boolbv.cpp
+++ b/src/solvers/flattening/boolbv.cpp
@@ -235,7 +235,7 @@ bvt boolbvt::convert_bitvector(const exprt &expr)
   else if(expr.id()==ID_power)
     return convert_power(to_binary_expr(expr));
   else if(expr.id() == ID_popcount)
-    return convert_bv(simplify_expr(to_popcount_expr(expr).lower(), ns));
+    return convert_popcount(to_popcount_expr(expr));
   else if(expr.id() == ID_count_leading_zeros)
   {
     return convert_bv(
@@ -407,7 +407,7 @@ literalt boolbvt::convert_rest(const exprt &expr)
       CHECK_RETURN(!bv.empty());
       const irep_idt type_id = op.type().id();
       if(type_id == ID_signedbv || type_id == ID_fixedbv || type_id == ID_floatbv)
-        return bv[bv.size()-1];
+        return bv_utils.sign_bit(bv);
       if(type_id == ID_unsignedbv)
         return const_literal(false);
     }
diff --git a/src/solvers/flattening/boolbv.h b/src/solvers/flattening/boolbv.h
index 62d2703ee10..c305b555ee0 100644
--- a/src/solvers/flattening/boolbv.h
+++ b/src/solvers/flattening/boolbv.h
@@ -37,6 +37,7 @@ class floatbv_round_to_integral_exprt;
 class floatbv_typecast_exprt;
 class ieee_float_op_exprt;
 class overflow_result_exprt;
+class popcount_exprt;
 class replication_exprt;
 class unary_overflow_exprt;
 class union_typet;
@@ -203,6 +204,7 @@ class boolbvt:public arrayst
   virtual bvt convert_bitreverse(const bitreverse_exprt &expr);
   virtual bvt convert_saturating_add_sub(const binary_exprt &expr);
   virtual bvt convert_overflow_result(const overflow_result_exprt &expr);
+  virtual bvt convert_popcount(const popcount_exprt &expr);
 
   bvt convert_update_bits(bvt src, const exprt &index, const bvt &new_value);
 
diff --git a/src/solvers/flattening/boolbv_popcount.cpp b/src/solvers/flattening/boolbv_popcount.cpp
new file mode 100644
index 00000000000..95a483facac
--- /dev/null
+++ b/src/solvers/flattening/boolbv_popcount.cpp
@@ -0,0 +1,20 @@
+/*******************************************************************\
+
+Module:
+
+Author: Michael Tautschnig
+
+\*******************************************************************/
+
+#include <util/bitvector_expr.h>
+
+#include "boolbv.h"
+
+bvt boolbvt::convert_popcount(const popcount_exprt &expr)
+{
+  const std::size_t width = boolbv_width(expr.type());
+
+  bvt op = convert_bv(expr.op());
+
+  return bv_utils.zero_extension(bv_utils.popcount(op), width);
+}
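The new convert_popcount delegates the bit counting to bv_utilst::popcount (added below in bv_utils.cpp), which flattens the parallel-counting scheme from Hacker's Delight. As an illustration, here is the same scheme on a concrete 32-bit integer; popcount_sketch is an illustrative name and this snippet is not part of the patch:

#include <cassert>
#include <cstdint>

// Each stage sums adjacent bit fields of doubling width; after the last
// stage the word holds the number of set bits. The masks mirror the ones
// bv_utilst::popcount builds out of constant literals.
static uint32_t popcount_sketch(uint32_t x)
{
  x = (x & 0x55555555u) + ((x >> 1) & 0x55555555u);   // 1-bit fields
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);   // 2-bit fields
  x = (x & 0x0F0F0F0Fu) + ((x >> 4) & 0x0F0F0F0Fu);   // 4-bit fields
  x = (x & 0x00FF00FFu) + ((x >> 8) & 0x00FF00FFu);   // 8-bit fields
  x = (x & 0x0000FFFFu) + ((x >> 16) & 0x0000FFFFu);  // 16-bit fields
  return x;
}

int main()
{
  assert(popcount_sketch(0xF0u) == 4);
  assert(popcount_sketch(0xFFFFFFFFu) == 32);
}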
"0" : "1") + result; + + if(bv[i].is_true()) + number += 1 << i; + } + + return result + " (" + std::to_string(number) + ")"; +} + bvt bv_utilst::build_constant(const mp_integer &n, std::size_t width) { std::string n_str=integer2binary(n, width); @@ -432,12 +463,11 @@ literalt bv_utilst::overflow_add( // An overflow occurs if the signs of the two operands are the same // and the sign of the sum is the opposite. - literalt old_sign=op0[op0.size()-1]; - literalt sign_the_same=prop.lequal(op0[op0.size()-1], op1[op1.size()-1]); + literalt old_sign = sign_bit(op0); + literalt sign_the_same = prop.lequal(sign_bit(op0), sign_bit(op1)); bvt result=add(op0, op1); - return - prop.land(sign_the_same, prop.lxor(result[result.size()-1], old_sign)); + return prop.land(sign_the_same, prop.lxor(sign_bit(result), old_sign)); } else if(rep==representationt::UNSIGNED) { @@ -457,7 +487,7 @@ literalt bv_utilst::overflow_sub( // x is negative, always representable, and // thus not an overflow. literalt op1_is_int_min=is_int_min(op1); - literalt op0_is_negative=op0[op0.size()-1]; + literalt op0_is_negative = sign_bit(op0); return prop.lselect(op1_is_int_min, @@ -557,7 +587,7 @@ bvt bv_utilst::shift(const bvt &src, const shiftt s, std::size_t dist) case shiftt::SHIFT_ARIGHT: // src.size()-i won't underflow as i &pps) return add(a, b); } +bvt bv_utilst::comba_column_wise(const std::vector &pps) +{ + PRECONDITION(!pps.empty()); + + std::vector columns(pps.front().size()); + for(const auto &pp : pps) + { + PRECONDITION(pp.size() == pps.front().size()); + for(std::size_t i = 0; i < pp.size(); ++i) + { + if(!pp[i].is_false()) + columns[i].push_back(pp[i]); + } + } + + bvt result; + result.reserve(columns.size()); + + for(std::size_t i = 0; i < columns.size(); ++i) + { + const bvt &column = columns[i]; + + if(column.empty()) + result.push_back(const_literal(false)); + else + { + bvt column_sum = popcount(column); + CHECK_RETURN(!column_sum.empty()); + result.push_back(column_sum.front()); + for(std::size_t j = 1; j < column_sum.size(); ++j) + { + if(i + j >= columns.size()) + break; + if(!column_sum[j].is_false()) + columns[i + j].push_back(column_sum[j]); + } + } + } + + return result; +} + // Wallace tree multiplier. This is disabled, as runtimes have // been observed to go up by 5%-10%, and on some models even by 20%. // #define WALLACE_TREE // Dadda' reduction scheme. This yields a smaller formula size than Wallace -// trees (and also the default addition scheme), but remains disabled as it -// isn't consistently more performant either. +// trees (and also the default addition scheme), but isn't consistently more +// performant with simple partial-product generation. Only when using +// higher-radix multipliers the combination appears to perform better. // #define DADDA_TREE // The following examples demonstrate the performance differences (with a @@ -916,16 +989,89 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // our multiplier that's not using a tree reduction scheme, but aren't uniformly // better either. +// Higher radix multipliers pre-compute partial products for groups of bits: +// radix-4 are groups of 2 bits, radix-8 are groups of 3 bits, and radix-16 are +// groups of 4 bits. Performance data for these variants combined with different +// (tree) reduction schemes are recorded at +// https://tinyurl.com/multiplier-comparison. 
@@ -916,16 +989,89 @@ bvt bv_utilst::dadda_tree(const std::vector<bvt> &pps)
 // our multiplier that's not using a tree reduction scheme, but aren't uniformly
 // better either.
 
+// Higher-radix multipliers pre-compute partial products for groups of bits:
+// radix-4 uses groups of 2 bits, radix-8 groups of 3 bits, and radix-16 groups
+// of 4 bits. Performance data for these variants combined with different
+// (tree) reduction schemes is recorded at
+// https://tinyurl.com/multiplier-comparison. The data suggests that radix-8
+// with Dadda's reduction yields the most consistent performance improvement
+// while not regressing substantially across the matrix of benchmarks run with
+// CaDiCaL and MiniSat2 as solvers.
+// #define RADIX_MULTIPLIER 8
+// #define USE_KARATSUBA
+// #define USE_TOOM_COOK
+#define USE_SCHOENHAGE_STRASSEN
+#ifdef RADIX_MULTIPLIER
+//# define COMBA
+# define DADDA_TREE
+#endif
+// #define COMBA
+
+#ifdef RADIX_MULTIPLIER
+static bvt unsigned_multiply_by_3(propt &prop, const bvt &op)
+{
+  PRECONDITION(prop.cnf_handled_well());
+  PRECONDITION(!op.empty());
+
+  bvt result;
+  result.reserve(op.size());
+
+  result.push_back(op[0]);
+  literalt prev_bit = const_literal(false);
+
+  for(std::size_t i = 1; i < op.size(); ++i)
+  {
+    literalt sum = prop.new_variable();
+
+    prop.lcnf({sum, !op[i - 1], !op[i], !prev_bit});
+    prop.lcnf({sum, !op[i - 1], !op[i], result.back()});
+    prop.lcnf({sum, op[i - 1], op[i], !prev_bit, result.back()});
+    prop.lcnf({sum, !op[i - 1], op[i], prev_bit, !result.back()});
+    prop.lcnf({sum, op[i - 1], !op[i], !result.back()});
+    prop.lcnf({sum, op[i - 1], !op[i], prev_bit});
+
+    prop.lcnf({!sum, !op[i - 1], op[i], !prev_bit});
+    prop.lcnf({!sum, !op[i - 1], op[i], result.back()});
+    prop.lcnf({!sum, !op[i - 1], !op[i], prev_bit, !result.back()});
+
+    prop.lcnf({!sum, op[i - 1], op[i], !result.back()});
+    prop.lcnf({!sum, op[i - 1], op[i], prev_bit});
+    prop.lcnf({!sum, op[i - 1], !op[i], !prev_bit, result.back()});
+
+    prop.lcnf({!sum, op[i], prev_bit, result.back()});
+    prop.lcnf({!sum, op[i], !prev_bit, !result.back()});
+
+    result.push_back(sum);
+    prev_bit = op[i - 1];
+  }
+
+  return result;
+}
+#endif
+
 bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
 {
-  bvt op0=_op0, op1=_op1;
+  PRECONDITION(!_op0.empty());
+  PRECONDITION(!_op1.empty());
 
-  if(is_constant(op1))
-    std::swap(op0, op1);
+  if(_op1.size() == 1)
+  {
+    bvt product;
+    product.reserve(_op0.size());
+    for(const auto &lit : _op0)
+      product.push_back(prop.land(lit, _op1.front()));
+    return product;
+  }
 
-  // build the usual quadratic number of partial products
+  // store partial products
   std::vector<bvt> pps;
-  pps.reserve(op0.size());
+  pps.reserve(_op0.size());
+
+  bvt op0 = _op0, op1 = _op1;
+
+#ifndef RADIX_MULTIPLIER
+  if(is_constant(op1))
+    std::swap(op0, op1);
 
   for(std::size_t bit=0; bit<op0.size(); bit++)
     if(op0[bit]!=const_literal(false))
     {
       // zeros according to weight
       bvt pp=zeros(bit);
       pp.reserve(op0.size());
 
       for(std::size_t idx=bit; idx<op0.size(); idx++)
         pp.push_back(prop.land(op1[idx-bit], op0[bit]));
 
       pps.push_back(pp);
     }
+#else
+# if RADIX_MULTIPLIER == 4
+#  define RADIX_GROUP_SIZE 2
+# elif RADIX_MULTIPLIER == 8
+#  define RADIX_GROUP_SIZE 3
+# elif RADIX_MULTIPLIER == 16
+#  define RADIX_GROUP_SIZE 4
+# endif
+
+  std::optional<bvt> times_three_opt;
+  auto times_three = [this, &times_three_opt, &op0]() -> const bvt &
+  {
+    if(!times_three_opt.has_value())
+    {
+# if 1
+      if(prop.cnf_handled_well())
+        times_three_opt = unsigned_multiply_by_3(prop, op0);
+      else
+# endif
+        times_three_opt = add(op0, shift(op0, shiftt::SHIFT_LEFT, 1));
+    }
+    return *times_three_opt;
+  };
+
+# if RADIX_MULTIPLIER >= 8
+  std::optional<bvt> times_five_opt, times_seven_opt;
+  auto times_five = [this, &times_five_opt, &op0]() -> const bvt &
+  {
+    if(!times_five_opt.has_value())
+      times_five_opt = add(op0, shift(op0, shiftt::SHIFT_LEFT, 2));
+    return *times_five_opt;
+  };
+  auto times_seven =
+    [this, &times_seven_opt, &op0, &times_three]() -> const bvt &
+  {
+    if(!times_seven_opt.has_value())
+      times_seven_opt = add(times_three(), shift(op0, shiftt::SHIFT_LEFT, 2));
+    return *times_seven_opt;
+  };
+# endif
+
+# if RADIX_MULTIPLIER == 16
+  std::optional<bvt> times_nine_opt, times_eleven_opt, times_thirteen_opt,
+    times_fifteen_opt;
+  auto times_nine = [this, &times_nine_opt, &op0]() -> const bvt &
+  {
+    if(!times_nine_opt.has_value())
+      times_nine_opt = add(op0, shift(op0, shiftt::SHIFT_LEFT, 3));
+    return *times_nine_opt;
+  };
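+  // Illustration (not part of the patch): with RADIX_MULTIPLIER == 16 the
+  // multiplier op1 is consumed in 4-bit groups, each selecting one of the
+  // pre-computed odd multiples (op0, 3*op0, 5*op0, ..., 15*op0) or a shifted
+  // copy of one of them. E.g. for op1 = 0b10110101 the groups are 0101 (5)
+  // and 1011 (11), least-significant group first, so
+  //   op0 * 0b10110101 = 5*op0 + ((11*op0) << 4) = (5 + 176)*op0 = 181*op0
+  // and only two partial products remain to be summed.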
+  auto times_eleven =
+    [this, &times_eleven_opt, &op0, &times_three]() -> const bvt &
+  {
+    if(!times_eleven_opt.has_value())
+      times_eleven_opt = add(times_three(), shift(op0, shiftt::SHIFT_LEFT, 3));
+    return *times_eleven_opt;
+  };
+  auto times_thirteen =
+    [this, &times_thirteen_opt, &op0, &times_five]() -> const bvt &
+  {
+    if(!times_thirteen_opt.has_value())
+      times_thirteen_opt = add(times_five(), shift(op0, shiftt::SHIFT_LEFT, 3));
+    return *times_thirteen_opt;
+  };
+  auto times_fifteen =
+    [this, &times_fifteen_opt, &op0, &times_seven]() -> const bvt &
+  {
+    if(!times_fifteen_opt.has_value())
+      times_fifteen_opt = add(times_seven(), shift(op0, shiftt::SHIFT_LEFT, 3));
+    return *times_fifteen_opt;
+  };
+# endif
+
+  for(std::size_t op1_idx = 0; op1_idx + RADIX_GROUP_SIZE - 1 < op1.size();
+      op1_idx += RADIX_GROUP_SIZE)
+  {
+    const literalt &bit0 = op1[op1_idx];
+    const literalt &bit1 = op1[op1_idx + 1];
+# if RADIX_MULTIPLIER >= 8
+    const literalt &bit2 = op1[op1_idx + 2];
+#  if RADIX_MULTIPLIER == 16
+    const literalt &bit3 = op1[op1_idx + 3];
+#  endif
+# endif
+    bvt partial_sum;
+
+    if(
+      bit0.is_constant() && bit1.is_constant()
+# if RADIX_MULTIPLIER >= 8
+      && bit2.is_constant()
+#  if RADIX_MULTIPLIER == 16
+      && bit3.is_constant()
+#  endif
+# endif
+    )
+    {
+      if(bit0.is_false()) // *0
+      {
+        if(bit1.is_false()) // *00
+        {
+# if RADIX_MULTIPLIER >= 8
+          if(bit2.is_false()) // *000
+          {
+#  if RADIX_MULTIPLIER == 16
+            if(bit3.is_false()) // 0000
+              continue;
+            else // 1000
+              partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 3);
+#  else
+            continue;
+#  endif
+          }
+          else // *100
+          {
+#  if RADIX_MULTIPLIER == 16
+            if(bit3.is_false()) // 0100
+              partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 2);
+            else // 1100
+              partial_sum =
+                shift(times_three(), shiftt::SHIFT_LEFT, op1_idx + 2);
+#  else
+            partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 2);
+#  endif
+          }
+# else
+          continue;
+# endif
+        }
+        else // *10
+        {
+# if RADIX_MULTIPLIER >= 8
+          if(bit2.is_false()) // *010
+          {
+#  if RADIX_MULTIPLIER == 16
+            if(bit3.is_false()) // 0010
+              partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 1);
+            else // 1010
+              partial_sum =
+                shift(times_five(), shiftt::SHIFT_LEFT, op1_idx + 1);
+#  else
+            partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 1);
+#  endif
+          }
+          else // *110
+          {
+#  if RADIX_MULTIPLIER == 16
+            if(bit3.is_false()) // 0110
+              partial_sum =
+                shift(times_three(), shiftt::SHIFT_LEFT, op1_idx + 1);
+            else // 1110
+              partial_sum =
+                shift(times_seven(), shiftt::SHIFT_LEFT, op1_idx + 1);
+#  else
+            partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx + 1);
+#  endif
+          }
+# else
+          partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 1);
+# endif
+        }
+      }
+      else // *1
+      {
+        if(bit1.is_false()) // *01
+        {
+# if RADIX_MULTIPLIER >= 8
+          if(bit2.is_false()) // *001
+          {
+#  if RADIX_MULTIPLIER == 16
+            if(bit3.is_false()) // 0001
+              partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx);
+            else // 1001
+              partial_sum = shift(times_nine(), shiftt::SHIFT_LEFT, op1_idx);
+#  else
+            partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx);
+#  endif
+          }
+          else // *101
+          {
+#  if RADIX_MULTIPLIER == 16
+            if(bit3.is_false()) // 0101
+              partial_sum = shift(times_five(), shiftt::SHIFT_LEFT, op1_idx);
+            else // 1101
+              partial_sum =
+                shift(times_thirteen(), shiftt::SHIFT_LEFT, op1_idx);
+#  else
+            partial_sum = shift(times_five(), shiftt::SHIFT_LEFT, op1_idx);
+#  endif
+          }
+# else
+          partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx);
+# endif
+        }
+        else // *11
+        {
+# if RADIX_MULTIPLIER >= 8
+          if(bit2.is_false()) // *011
+          {
+#  if RADIX_MULTIPLIER == 16
if(bit3.is_false()) // 0011 + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx); + else // 1011 + partial_sum = shift(times_eleven(), shiftt::SHIFT_LEFT, op1_idx); +# else + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } + else // *111 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0111 + partial_sum = shift(times_seven(), shiftt::SHIFT_LEFT, op1_idx); + else // 1111 + partial_sum = shift(times_fifteen(), shiftt::SHIFT_LEFT, op1_idx); +# else + partial_sum = shift(times_seven(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } +# else + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } + } + } + else + { + partial_sum = bvt(op1_idx, const_literal(false)); + for(std::size_t op0_idx = 0; op0_idx + op1_idx < op0.size(); ++op0_idx) + { +# if RADIX_MULTIPLIER == 4 + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + + // 00 + prop.lcnf({bit0, bit1, !partial_sum_bit}); + // 01 -> sum = _op0 + prop.lcnf({!bit0, bit1, !partial_sum_bit, _op0[op0_idx]}); + prop.lcnf({!bit0, bit1, partial_sum_bit, !_op0[op0_idx]}); + // 10 -> sum = (_op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, !partial_sum_bit, _op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, partial_sum_bit, !_op0[op0_idx - 1]}); + } + // 11 -> sum = times_three + prop.lcnf({!bit0, !bit1, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf({!bit0, !bit1, partial_sum_bit, !times_three()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit1, + prop.land(bit0, op0[op0_idx]), // 0x + prop.lselect( // 1x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx]))); + } +# elif RADIX_MULTIPLIER == 8 + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + + // 000 + prop.lcnf({bit0, bit1, bit2, !partial_sum_bit}); + // 001 -> sum = _op0 + prop.lcnf({!bit0, bit1, bit2, !partial_sum_bit, _op0[op0_idx]}); + prop.lcnf({!bit0, bit1, bit2, partial_sum_bit, !_op0[op0_idx]}); + // 010 -> sum = (_op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit, _op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, bit2, partial_sum_bit, !_op0[op0_idx - 1]}); + } + // 011 -> sum = times_three + prop.lcnf( + {!bit0, !bit1, bit2, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, bit2, partial_sum_bit, !times_three()[op0_idx]}); + // 100 -> sum = (_op0 << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit, _op0[op0_idx - 2]}); + prop.lcnf({bit0, bit1, !bit2, partial_sum_bit, !_op0[op0_idx - 2]}); + } + // 101 -> sum = times_five + prop.lcnf( + {!bit0, bit1, !bit2, !partial_sum_bit, times_five()[op0_idx]}); + prop.lcnf( + {!bit0, bit1, !bit2, partial_sum_bit, !times_five()[op0_idx]}); + // 110 -> sum = (times_three << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + !bit2, + !partial_sum_bit, + times_three()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + !bit2, + partial_sum_bit, + !times_three()[op0_idx - 1]}); + } + // 111 -> sum = times_seven + prop.lcnf( + {!bit0, !bit1, !bit2, !partial_sum_bit, times_seven()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, !bit2, 
partial_sum_bit, !times_seven()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit2, + prop.lselect( // 0* + !bit1, + prop.land(bit0, op0[op0_idx]), // 00x + prop.lselect( // 01x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx])), + prop.lselect( // 1* + !bit1, + prop.lselect( // 10x + !bit0, + op0_idx <= 1 ? const_literal(false) : op0[op0_idx - 2], + times_five()[op0_idx]), + prop.lselect( // 11x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_three()[op0_idx - 1], + times_seven()[op0_idx])))); + } +# elif RADIX_MULTIPLIER == 16 + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + + // 0000 + prop.lcnf({bit0, bit1, bit2, bit3, !partial_sum_bit}); + // 0001 -> sum = op0 + prop.lcnf({!bit0, bit1, bit2, bit3, !partial_sum_bit, op0[op0_idx]}); + prop.lcnf({!bit0, bit1, bit2, bit3, partial_sum_bit, !op0[op0_idx]}); + // 0010 -> sum = (op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, !bit1, bit2, bit3, !partial_sum_bit, op0[op0_idx - 1]}); + prop.lcnf( + {bit0, !bit1, bit2, bit3, partial_sum_bit, !op0[op0_idx - 1]}); + } + // 0011 -> sum = times_three + prop.lcnf( + {!bit0, + !bit1, + bit2, + bit3, + !partial_sum_bit, + times_three()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + bit2, + bit3, + partial_sum_bit, + !times_three()[op0_idx]}); + // 0100 -> sum = (op0 << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, bit1, !bit2, bit3, !partial_sum_bit, op0[op0_idx - 2]}); + prop.lcnf( + {bit0, bit1, !bit2, bit3, partial_sum_bit, !op0[op0_idx - 2]}); + } + // 0101 -> sum = times_five + prop.lcnf( + {!bit0, + bit1, + !bit2, + bit3, + !partial_sum_bit, + times_five()[op0_idx]}); + prop.lcnf( + {!bit0, + bit1, + !bit2, + bit3, + partial_sum_bit, + !times_five()[op0_idx]}); + // 0110 -> sum = (times_three << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + !bit2, + bit3, + !partial_sum_bit, + times_three()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + !bit2, + bit3, + partial_sum_bit, + !times_three()[op0_idx - 1]}); + } + // 0111 -> sum = times_seven + prop.lcnf( + {!bit0, + !bit1, + !bit2, + bit3, + !partial_sum_bit, + times_seven()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + !bit2, + bit3, + partial_sum_bit, + !times_seven()[op0_idx]}); + + // 1000 -> sum = (op0 << 3) + if(op0_idx == 0 || op0_idx == 1 || op0_idx == 2) + prop.lcnf({bit0, bit1, bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, bit1, bit2, !bit3, !partial_sum_bit, op0[op0_idx - 3]}); + prop.lcnf( + {bit0, bit1, bit2, !bit3, partial_sum_bit, !op0[op0_idx - 3]}); + } + // 1001 -> sum = times_nine + prop.lcnf( + {!bit0, + bit1, + bit2, + !bit3, + !partial_sum_bit, + times_nine()[op0_idx]}); + prop.lcnf( + {!bit0, + bit1, + bit2, + !bit3, + partial_sum_bit, + !times_nine()[op0_idx]}); + // 1010 -> sum = (times_five << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + bit2, + !bit3, + !partial_sum_bit, + times_five()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + bit2, + !bit3, + partial_sum_bit, + !times_five()[op0_idx - 1]}); + } + // 1011 -> sum = times_eleven + prop.lcnf( + {!bit0, + !bit1, + bit2, + !bit3, + !partial_sum_bit, + times_eleven()[op0_idx]}); + prop.lcnf( + 
{!bit0, + !bit1, + bit2, + !bit3, + partial_sum_bit, + !times_eleven()[op0_idx]}); + // 1100 -> sum = (times_three << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_three()[op0_idx - 2]}); + prop.lcnf( + {bit0, + bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_three()[op0_idx - 2]}); + } + // 1101 -> sum = times_thirteen + prop.lcnf( + {!bit0, + bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_thirteen()[op0_idx]}); + prop.lcnf( + {!bit0, + bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_thirteen()[op0_idx]}); + // 1110 -> sum = (times_seven << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_seven()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_seven()[op0_idx - 1]}); + } + // 1111 -> sum = times_fifteen + prop.lcnf( + {!bit0, + !bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_fifteen()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_fifteen()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit3, + prop.lselect( // 0* + !bit2, + prop.lselect( // 00* + !bit1, + prop.land(bit0, op0[op0_idx]), // 000x + prop.lselect( // 001x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx])), + prop.lselect( // 01* + !bit1, + prop.lselect( // 010x + !bit0, + op0_idx <= 1 ? const_literal(false) : op0[op0_idx - 2], + times_five()[op0_idx]), + prop.lselect( // 011x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_three()[op0_idx - 1], + times_seven()[op0_idx]))), + prop.lselect( // 1* + !bit2, + prop.lselect( // 10* + !bit1, + prop.lselect( // 100x + !bit0, + op0_idx <= 2 ? const_literal(false) : op0[op0_idx - 3], + times_nine()[op0_idx]), + prop.lselect( // 101x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_five()[op0_idx - 1], + times_eleven()[op0_idx])), + prop.lselect( // 11* + !bit1, + prop.lselect( // 110x + !bit0, + op0_idx <= 1 ? const_literal(false) + : times_three()[op0_idx - 2], + times_thirteen()[op0_idx]), + prop.lselect( // 111x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_seven()[op0_idx - 1], + times_fifteen()[op0_idx]))))); + } +# else +# error Unsupported radix +# endif + } + } + + pps.push_back(std::move(partial_sum)); + } + + if(op1.size() % RADIX_GROUP_SIZE == 1) + { + if(op0.size() == op1.size()) + { + if(pps.empty()) + pps.push_back(bvt(op0.size(), const_literal(false))); + + // This is the partial product of the MSB of op1 with op0, which is all + // zeros except for (possibly) the MSB. Since we don't need to account for + // any carry out of adding this partial product, we just need to compute + // the sum the MSB of one of the partial products and this partial + // product, we is an xor of just those bits. 
+ pps.back().back() = + prop.lxor(pps.back().back(), prop.land(op0[0], op1.back())); + } + else + { + bvt partial_sum = bvt(op1.size() - 1, const_literal(false)); + for(const auto &lit : op0) + { + partial_sum.push_back(prop.land(lit, op1.back())); + if(partial_sum.size() == op0.size()) + break; + } + pps.push_back(std::move(partial_sum)); + } + } +# if RADIX_MULTIPLIER >= 8 + else if(op1.size() % RADIX_GROUP_SIZE == 2) + { + const literalt &bit0 = op1[op1.size() - 2]; + const literalt &bit1 = op1[op1.size() - 1]; + + bvt partial_sum = bvt(op1.size() - 2, const_literal(false)); + for(std::size_t op0_idx = 0; op0_idx < 2; ++op0_idx) + { + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + // 00 + prop.lcnf({bit0, bit1, !partial_sum_bit}); + // 01 -> sum = op0 + prop.lcnf({!bit0, bit1, !partial_sum_bit, op0[op0_idx]}); + prop.lcnf({!bit0, bit1, partial_sum_bit, !op0[op0_idx]}); + // 10 -> sum = (op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, !partial_sum_bit, op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, partial_sum_bit, !op0[op0_idx - 1]}); + } + // 11 -> sum = times_three + prop.lcnf({!bit0, !bit1, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf({!bit0, !bit1, partial_sum_bit, !times_three()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit1, + prop.land(bit0, op0[op0_idx]), // 0x + prop.lselect( // 1x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx]))); + } + } + + pps.push_back(std::move(partial_sum)); + } +# endif +# if RADIX_MULTIPLIER == 16 + else if(op1.size() % RADIX_GROUP_SIZE == 3) + { + const literalt &bit0 = op1[op1.size() - 3]; + const literalt &bit1 = op1[op1.size() - 2]; + const literalt &bit2 = op1[op1.size() - 1]; + + bvt partial_sum = bvt(op1.size() - 3, const_literal(false)); + for(std::size_t op0_idx = 0; op0_idx < 3; ++op0_idx) + { + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + // 000 + prop.lcnf({bit0, bit1, bit2, !partial_sum_bit}); + // 001 -> sum = op0 + prop.lcnf({!bit0, bit1, bit2, !partial_sum_bit, op0[op0_idx]}); + prop.lcnf({!bit0, bit1, bit2, partial_sum_bit, !op0[op0_idx]}); + // 010 -> sum = (op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit, op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, bit2, partial_sum_bit, !op0[op0_idx - 1]}); + } + // 011 -> sum = times_three + prop.lcnf( + {!bit0, !bit1, bit2, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, bit2, partial_sum_bit, !times_three()[op0_idx]}); + // 100 -> sum = (op0 << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit, op0[op0_idx - 2]}); + prop.lcnf({bit0, bit1, !bit2, partial_sum_bit, !op0[op0_idx - 2]}); + } + // 101 -> sum = times_five + prop.lcnf( + {!bit0, bit1, !bit2, !partial_sum_bit, times_five()[op0_idx]}); + prop.lcnf( + {!bit0, bit1, !bit2, partial_sum_bit, !times_five()[op0_idx]}); + // 110 -> sum = (times_three << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, !bit1, !bit2, !partial_sum_bit, times_three()[op0_idx - 1]}); + prop.lcnf( + {bit0, !bit1, !bit2, partial_sum_bit, !times_three()[op0_idx - 1]}); + } + // 111 -> sum = 
times_seven
+          prop.lcnf(
+            {!bit0, !bit1, !bit2, !partial_sum_bit, times_seven()[op0_idx]});
+          prop.lcnf(
+            {!bit0, !bit1, !bit2, partial_sum_bit, !times_seven()[op0_idx]});
+      }
+      else
+      {
+        partial_sum.push_back(prop.lselect(
+          !bit2,
+          prop.lselect( // 0*
+            !bit1,
+            prop.land(bit0, op0[op0_idx]), // 00x
+            prop.lselect( // 01x
+              !bit0,
+              op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1],
+              times_three()[op0_idx])),
+          prop.lselect( // 1*
+            !bit1,
+            prop.lselect( // 10x
+              !bit0,
+              op0_idx <= 1 ? const_literal(false) : op0[op0_idx - 2],
+              times_five()[op0_idx]),
+            prop.lselect( // 11x
+              !bit0,
+              op0_idx == 0 ? const_literal(false) : times_three()[op0_idx - 1],
+              times_seven()[op0_idx]))));
+      }
+    }
+
+    pps.push_back(std::move(partial_sum));
+  }
+# endif
+#endif
 
   if(pps.empty())
     return zeros(op0.size());
@@ -950,6 +1894,8 @@ bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
     return wallace_tree(pps);
 #elif defined(DADDA_TREE)
     return dadda_tree(pps);
+#elif defined(COMBA)
+    return comba_column_wise(pps);
 #else
     bvt product = pps.front();
 
@@ -961,6 +1907,724 @@ bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
   }
 }
 
+bvt bv_utilst::unsigned_karatsuba_full_multiplier(
+  const bvt &op0,
+  const bvt &op1)
+{
+  // Symbolic encodings of multiplication in software verification need to
+  // distinguish truncating multiplication (x * y mod 2^op_size) from
+  // multiplication producing a double-output-width result. This function
+  // computes the double-output-width product. A truncating Karatsuba
+  // multiplication can then be assembled from two truncating half-width
+  // multiplications plus one double-output-width multiplication of half
+  // width (see unsigned_karatsuba_multiplier below). For the
+  // double-output-width product itself, the challenge in applying
+  // Karatsuba's trick is avoiding extension of the operand width; see also
+  // the discussion in the edit history of the Wikipedia article on
+  // Karatsuba's algorithm.
+
+  PRECONDITION(op0.size() == op1.size());
+  const std::size_t op_size = op0.size();
+  PRECONDITION(op_size > 0);
+  PRECONDITION((op_size & (op_size - 1)) == 0);
+
+  if(op_size == 1)
+    return {prop.land(op0[0], op1[0]), const_literal(false)};
+
+  const std::size_t half_op_size = op_size >> 1;
+
+  bvt x0{op0.begin(), op0.begin() + half_op_size};
+  bvt x1{op0.begin() + half_op_size, op0.end()};
+
+  bvt y0{op1.begin(), op1.begin() + half_op_size};
+  bvt y1{op1.begin() + half_op_size, op1.end()};
+
+  bvt z0 = unsigned_karatsuba_full_multiplier(x0, y0);
+  bvt z2 = unsigned_karatsuba_full_multiplier(x1, y1);
+
+  bvt x0_sub = zero_extension(x0, half_op_size + 1);
+  bvt x1_sub = zero_extension(x1, half_op_size + 1);
+
+  bvt y0_sub = zero_extension(y0, half_op_size + 1);
+  bvt y1_sub = zero_extension(y1, half_op_size + 1);
+
+  bvt x1_minus_x0_ext = sub(x1_sub, x0_sub);
+  literalt x1_minus_x0_sign = sign_bit(x1_minus_x0_ext);
+  bvt x1_minus_x0_abs = absolute_value(x1_minus_x0_ext);
+  x1_minus_x0_abs.pop_back();
+  bvt y0_minus_y1_ext = sub(y0_sub, y1_sub);
+  literalt y0_minus_y1_sign = sign_bit(y0_minus_y1_ext);
+  bvt y0_minus_y1_abs = absolute_value(y0_minus_y1_ext);
+  y0_minus_y1_abs.pop_back();
+  bvt sub_mult =
+    unsigned_karatsuba_full_multiplier(x1_minus_x0_abs, y0_minus_y1_abs);
+  bvt sub_mult_ext = zero_extension(sub_mult, op_size + 1);
+  bvt z1_ext = add_sub(
+    add(zero_extension(z0, op_size + 1), zero_extension(z2, op_size + 1)),
+    sub_mult_ext,
+    prop.lxor(x1_minus_x0_sign, y0_minus_y1_sign));
+
+  bvt z0_full = zero_extension(z0, op_size << 1);
+  bvt z1_full =
+    zero_extension(concatenate(zeros(half_op_size), z1_ext), op_size << 1);
+  bvt z2_full = concatenate(zeros(op_size), z2);
+
+  return add(add(z0_full, z1_full), z2_full);
+}
+
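The same z0/z1/z2 decomposition can be checked on concrete machine integers. The sketch below (karatsuba32 is an illustrative name, not part of the patch) multiplies two 32-bit values via three 16-bit-half multiplications, using the signed middle term exactly as the flattened version does with a sign literal:

#include <cassert>
#include <cstdint>

static uint64_t karatsuba32(uint32_t x, uint32_t y)
{
  const uint32_t x0 = x & 0xFFFFu, x1 = x >> 16;
  const uint32_t y0 = y & 0xFFFFu, y1 = y >> 16;

  const uint64_t z0 = (uint64_t)x0 * y0;
  const uint64_t z2 = (uint64_t)x1 * y1;
  // (x1 - x0) * (y0 - y1) + z0 + z2 == x1*y0 + x0*y1
  const int64_t mid =
    (int64_t)((int32_t)x1 - (int32_t)x0) * ((int32_t)y0 - (int32_t)y1);
  const uint64_t z1 = (uint64_t)(mid + (int64_t)z0 + (int64_t)z2);

  return (z2 << 32) + (z1 << 16) + z0;
}

int main()
{
  assert(karatsuba32(0xDEADBEEFu, 0x12345678u) ==
         0xDEADBEEFull * 0x12345678ull);
}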
+bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1)
+{
+  if(_op0.size() != _op1.size())
+    return unsigned_multiplier(_op0, _op1);
+
+  const std::size_t op_size = _op0.size();
+  if(op_size == 1)
+    return {prop.land(_op0[0], _op1[0])};
+
+  // Make sure we work with operands the length of which is a power of two
+  const std::size_t log2 = address_bits(op_size);
+  PRECONDITION(sizeof(std::size_t) * CHAR_BIT > log2);
+  const std::size_t two_to_log2 = (std::size_t)1 << log2;
+  bvt a = zero_extension(_op0, two_to_log2);
+  bvt b = zero_extension(_op1, two_to_log2);
+
+  const std::size_t half_op_size = two_to_log2 >> 1;
+
+  // We split each of the operands in half and treat the halves as
+  // coefficients of a polynomial: a = x1 * 2^half_op_size + x0 and
+  // b = y1 * 2^half_op_size + y0. Straightforward polynomial multiplication
+  // then yields
+  //   x1 * y1 * 2^op_size + (x1 * y0 + x0 * y1) * 2^half_op_size + x0 * y0
+  // These would be four multiplications (the operands of which have half the
+  // original bit width):
+  //   z0 = x0 * y0
+  //   z1 = x1 * y0 + x0 * y1
+  //   z2 = x1 * y1
+  // Karatsuba's insight is that these four multiplications can be expressed
+  // using just three multiplications:
+  //   z1 = (x1 - x0) * (y0 - y1) + z0 + z2
+  //
+  // Worked 4-bit example, 4-bit result:
+  // abcd * efgh -> 4-bit result
+  // cd * gh -> 4-bit result
+  // cd * ef -> 2-bit result
+  // ab * gh -> 2-bit result
+  // d * h -> 2-bit result
+  // c * g -> 2-bit result
+  // (c - d) * (h - g) + dh + cg; use an extra sign bit for each of the
+  // subtractions, and conditionally negate the product by xor-ing those sign
+  // bits; dh + cg is a 2-bit addition (with possible results 0, 1, 2); the
+  // product has possible values (-1, 0, 1); the final sum cannot evaluate to
+  // -1 as
+  // * c=1, d=0, h=0, g=1 (1 * -1) implies cg=1
+  // * c=0, d=1, h=1, g=0 (-1 * 1) implies dh=1
+  // Therefore, after adding (dh + cg), the product can safely be added
+  // over just 2 bits.
+ + bvt x0{a.begin(), a.begin() + half_op_size}; + bvt x1{a.begin() + half_op_size, a.end()}; + bvt y0{b.begin(), b.begin() + half_op_size}; + bvt y1{b.begin() + half_op_size, b.end()}; + + bvt z0 = unsigned_karatsuba_full_multiplier(x0, y0); + bvt z1 = add( + unsigned_karatsuba_multiplier(x1, y0), + unsigned_karatsuba_multiplier(x0, y1)); + bvt z1_full = concatenate(zeros(half_op_size), z1); + + bvt result = add(z0, z1_full); + CHECK_RETURN(result.size() >= op_size); + if(result.size() > op_size) + result.resize(op_size); + return result; +} + +bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) +{ + PRECONDITION(_op0.size() == _op1.size()); + PRECONDITION(!_op0.empty()); + + if(_op0.size() == 1) + return {prop.land(_op0[0], _op1[0])}; + + // break up _op0, _op1 in groups of at most GROUP_SIZE bits +#define GROUP_SIZE 8 + const std::size_t d_bits = + 2 * GROUP_SIZE + + 2 * address_bits((_op0.size() + GROUP_SIZE - 1) / GROUP_SIZE); + std::vector a, b, c_ops, d; + for(std::size_t i = 0; i < _op0.size(); i += GROUP_SIZE) + { + std::size_t u = std::min(i + GROUP_SIZE, _op0.size()); + a.emplace_back(_op0.begin() + i, _op0.begin() + u); + b.emplace_back(_op1.begin() + i, _op1.begin() + u); + + c_ops.push_back(zeros(i)); + d.push_back(prop.new_variables(d_bits)); + c_ops.back().insert(c_ops.back().end(), d.back().begin(), d.back().end()); + c_ops.back() = zero_extension(c_ops.back(), _op0.size()); + } + for(std::size_t i = a.size(); i < 2 * a.size() - 1; ++i) + { + d.push_back(prop.new_variables(d_bits)); + } + + // r(0) + bvt r_0 = d[0]; + prop.l_set_to_true(equal( + r_0, + unsigned_multiplier( + zero_extension(a[0], r_0.size()), zero_extension(b[0], r_0.size())))); + + for(std::size_t j = 1; j < a.size(); ++j) + { + // r(2^(j-1)) + bvt r_j = zero_extension( + d[0], std::min(_op0.size(), d[0].size() + (j - 1) * (d.size() - 1))); + for(std::size_t i = 1; i < d.size(); ++i) + { + r_j = add( + r_j, + shift( + zero_extension(d[i], r_j.size()), shiftt::SHIFT_LEFT, (j - 1) * i)); + } + + bvt a_even = zero_extension(a[0], r_j.size()); + for(std::size_t i = 2; i < a.size(); i += 2) + { + a_even = add( + a_even, + shift( + zero_extension(a[i], a_even.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + bvt a_odd = zero_extension(a[1], r_j.size()); + for(std::size_t i = 3; i < a.size(); i += 2) + { + a_odd = add( + a_odd, + shift( + zero_extension(a[i], a_odd.size()), + shiftt::SHIFT_LEFT, + (j - 1) * (i - 1))); + } + bvt b_even = zero_extension(b[0], r_j.size()); + for(std::size_t i = 2; i < b.size(); i += 2) + { + b_even = add( + b_even, + shift( + zero_extension(b[i], b_even.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + bvt b_odd = zero_extension(b[1], r_j.size()); + for(std::size_t i = 3; i < b.size(); i += 2) + { + b_odd = add( + b_odd, + shift( + zero_extension(b[i], b_odd.size()), + shiftt::SHIFT_LEFT, + (j - 1) * (i - 1))); + } + + prop.l_set_to_true(equal( + r_j, + unsigned_multiplier( + add(a_even, shift(a_odd, shiftt::SHIFT_LEFT, j - 1)), + add(b_even, shift(b_odd, shiftt::SHIFT_LEFT, j - 1))))); + + // r(-2^(j-1)) + bvt r_minus_j = zero_extension( + d[0], std::min(_op0.size(), d[0].size() + (j - 1) * (d.size() - 1))); + for(std::size_t i = 1; i < d.size(); ++i) + { + if(i % 2 == 1) + { + r_minus_j = sub( + r_minus_j, + shift( + zero_extension(d[i], r_minus_j.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + else + { + r_minus_j = add( + r_minus_j, + shift( + zero_extension(d[i], r_minus_j.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + } + + 
prop.l_set_to_true(equal( + r_minus_j, + unsigned_multiplier( + sub(a_even, shift(a_odd, shiftt::SHIFT_LEFT, j - 1)), + sub(b_even, shift(b_odd, shiftt::SHIFT_LEFT, j - 1))))); + } + + if(c_ops.empty()) + return zeros(_op0.size()); + else + { +#ifdef WALLACE_TREE + return wallace_tree(c_ops); +#elif defined(DADDA_TREE) + return dadda_tree(c_ops); +#elif defined(COMBA) + return comba_column_wise(c_ops); +#else + bvt product = c_ops.front(); + + for(auto it = std::next(c_ops.begin()); it != c_ops.end(); ++it) + product = add(product, *it); + + return product; +#endif + } +} + +bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( + const bvt &_a, + const bvt &_b) +{ + // http://malte-leip.net/beschreibung_ssa.pdf, + // https://de.wikipedia.org/wiki/Sch%C3%B6nhage-Strassen-Algorithmus + // Isabelle proof: https://mediatum.ub.tum.de/doc/1717658/1717658.pdf + bvt a = _a; +#if 1 + // bvt b = a; + bvt b = _b; +#else + bvt b = _b; + a.resize(14); + b.resize(14); +#endif + + PRECONDITION(a.size() == b.size()); + const std::size_t op_size = a.size(); + // delta.size() <= result_size doesn't hold for op_size <= 3 + if(op_size <= 3) + return unsigned_multiplier(a, b); + + // Running example: we want to multiply 213 by 15 as 8- or 9-bit integers. + // That is, we seek to multiply 11010101 (011010101) by 00001111 (000001111). + // ^bit 7 ^bit 0 + // The expected result is 123 as both an 8-bit and 9-bit result (001111011). + + // We compute the result modulo a Fermat number F_m = 2^2^m + 1. The maximum + // result when multiplying a by b (with their sizes being the same per the + // precondition above) is 2^2*op_size - 1. + // TODO: we don't actually need a full multiplier, a result with up to op_size + // bits is sufficient for our purposes. + // Hence we require 2^2^m >= 2^2*op_size, i.e., 2^m >= 2*op_size, or + // m >= log_2(op_size) + 1. + // For our examples m will be 4 and 5, respectively, with Fermat numbers + // 2^16 + 1 and 2^32 + 1. + const std::size_t m = address_bits(op_size) + 1; + std::cerr << "m: " << m << std::endl; + + // Extend bit width to 2^(m + 1) = op_size (rounded to next power of 2) * 4 + // For our examples, extended bit widths will be 32 and 64. + PRECONDITION(sizeof(std::size_t) * CHAR_BIT > m + 1); + const std::size_t two_to_m_plus_1 = (std::size_t)1 << (m + 1); + std::cerr << "a: " << beautify(a) << std::endl; + std::cerr << "b: " << beautify(b) << std::endl; + bvt a_ext = zero_extension(a, two_to_m_plus_1); + bvt b_ext = zero_extension(b, two_to_m_plus_1); + + // We need to distinguish whether m is even or odd + // m = 2n - 1 for odd m and m = 2n -2 for even m + // For our 8-bit inputs we have m = 4 and, therefore, n = 3. + // For our 9-bit inputs we have m = 5 and, therefore, n = 3. + const std::size_t n = m % 2 == 1 ? (m + 1) / 2 : m / 2 + 1; + std::cerr << "n: " << n << std::endl; + + // For even m create 2^n (of 2^(n - 1) bits) chunks from a_ext, b_ext (for our + // 8-bit inputs we have chunk_size = 4 with num_chunks = 8). + // For odd m create 2^(n + 1) chunks (of 2^(n - 1) bits) from a_ext, b_ext; + // a_0 will be bit positions 0 through to 2^(n - 1) - 1, a_{2^(n + 1) - 1} + // will be bit positions up to 2^(m + 1) - 1. + // For our 9-bit inputs we have chunk_size = 4 with num_chunks = 16 + const std::size_t chunk_size = (std::size_t)1 << (n - 1); + const std::size_t num_chunks = two_to_m_plus_1 / chunk_size; + CHECK_RETURN( + num_chunks == m % 2 ? 
(std::size_t)1 << (n + 1) : (std::size_t)1 << n); + std::cerr << "chunk_size: " << chunk_size << std::endl; + std::cerr << "num_chunks: " << num_chunks << std::endl; + std::cerr << "address_bits(num_chunks): " << address_bits(num_chunks) + << std::endl; + + std::vector a_rho, b_sigma; + a_rho.reserve(num_chunks); + b_sigma.reserve(num_chunks); + for(std::size_t i = 0; i < num_chunks; ++i) + { + a_rho.emplace_back( + a_ext.begin() + i * chunk_size, a_ext.begin() + (i + 1) * chunk_size); + std::cerr << "a_rho[" << i << "]: " << beautify(a_rho.back()) << std::endl; + b_sigma.emplace_back( + b_ext.begin() + i * chunk_size, b_ext.begin() + (i + 1) * chunk_size); + std::cerr << "b_sigma[" << i << "]: " << beautify(b_sigma.back()) + << std::endl; + } + // For our example we now have + // a_rho = [ 0101, 1101, 0000, ..., 0000 ] + // b_sigma = [ 1111, 0000, 0000, ..., 0000 ] + + // Compute gamma_r = \sum_{i + j = r} a_i * b_j with bit width 3n + 5 with r + // ranging from 0 to 2^(n + 2) - 1 (to 2^(n + 1) - 1 when m is even). + // For our example this will be additions/multiplications of width 14 + // (implying that school book multiplication would be cheaper, as is the case + // for all operand lengths below 32 bits). + // TODO: all subsequent steps seem to be using mod 2^(n + 2) (mod 2^(n + 1) + // when m is even), so it may be sufficient to do this over n + 2 bits instead + // of 3n + 5. + std::vector gamma_tau{num_chunks * 2, zeros(3 * n + 5)}; + for(std::size_t tau = 0; tau < num_chunks * 2; ++tau) + { + for(std::size_t rho = tau < num_chunks ? 0 : tau - num_chunks + 1; + rho < num_chunks && rho <= tau; + ++rho) + { + const std::size_t sigma = tau - rho; + std::cerr << "Inner multiplication a_" << rho << " * b_" << sigma; + auto inner_product = zero_extension( + unsigned_multiplier( + zero_extension(a_rho[rho], chunk_size * 2), + zero_extension(b_sigma[sigma], chunk_size * 2)), + 3 * n + 5); + std::cerr << " = " << beautify(inner_product) << std::endl; + gamma_tau[tau] = add(gamma_tau[tau], inner_product); + std::cerr << "gamma_tau[" << tau << "] = " << beautify(gamma_tau[tau]) + << std::endl; + } + } + // For our example we obtain + // gamma_tau = [ 00 0000 0100 1011, 00 0000 1100 0011, 0.... ] + + // Compute c_tau over bit width n + 2 (n + 1 when m is even) as gamma_tau + + // gamma_{tau + 2^(n + 1)} (gamma_{tau + 2^n} when m is even). + std::vector c_tau; + c_tau.reserve(num_chunks); + for(std::size_t tau = 0; tau < num_chunks; ++tau) + { + // std::cerr << "gamma_tau[" << tau << "]: " << beautify(gamma_tau[tau]) << std::endl; + c_tau.push_back(add(gamma_tau[tau], gamma_tau[tau + num_chunks])); + CHECK_RETURN(c_tau.back().size() >= address_bits(num_chunks) + 1); + c_tau.back().resize(address_bits(num_chunks) + 1); + std::cerr << "c_tau[" << tau << "]: " << beautify(c_tau[tau]) << std::endl; + } + // For our example we obtain + // c_tau = [ 01011, 00011, 0... ] + + // Compute z_j = c_j - c_{j + 2^n} (mod 2^(n + 2)) (mod 2^(n + 1) and c_{j + + // 2^(n - 1)} when m is even) + std::vector z_j; + z_j.reserve(num_chunks / 2); + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + z_j.push_back(sub(c_tau[j], c_tau[j + num_chunks / 2])); + // z_j.back().resize(address_bits(num_chunks) + 1); + std::cerr << "z_" << j << " = " << beautify(z_j.back()) << std::endl; + } + // For our example we have z_j = c_tau as all elements beyond the second one + // are zeros. + + // Compute z_j mod F_n using number-theoretic transform with omega = 2 for + // odd m and omega = 4 for even m. 
+ // For our examples we have F_n = 2^2^n + 1 = 257 with 2 being a 2^(n + 1)-th + // root of unity, i.e., 2^16 \equiv 1 (mod 257) (with 4 being a 2^n-root of + // unity, i.e., 4^8 \equiv 1 (mod 257). The DFT table for omega = 2 would be + // 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + // 1 2 4 8 16 32 64 128 -1 -2 -4 -8 -16 -32 -64 -128 + // 1 4 16 64 -1 -4 -16 -64 1 4 16 64 -1 -4 -16 -64 + // 1 8 64 -2 -16 -128 4 32 -1 -8 -64 2 16 128 -4 -32 + // 1 16 -1 -16 1 16 -1 -16 1 16 -1 -16 1 16 -1 -16 + // 1 32 -4 -128 16 -2 -64 8 -1 -32 4 128 -16 2 64 -8 + // 1 64 -16 4 -1 -64 16 -2 1 64 -16 4 -1 -64 16 -4 + // 1 128 -64 32 -16 8 -2 2 -1 -128 64 -32 16 -8 4 -2 + // 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 + // 1 -2 4 -8 16 -32 64 -128 -1 2 -4 8 -16 32 -64 128 + // 1 -4 16 -64 -1 4 -16 64 1 -4 16 -64 -1 4 -16 64 + // 1 -8 64 2 -16 128 4 -32 -1 8 -64 -2 16 -128 -4 32 + // 1 -16 -1 16 1 -16 -1 16 1 -16 -1 16 1 -16 -1 16 + // 1 -32 -4 128 16 2 -64 -8 -1 32 4 -128 -16 -2 64 8 + // 1 -64 -16 -4 -1 64 16 4 1 -64 -16 -4 -1 64 16 4 + // 1 -128 -64 -32 -16 -8 -4 -2 -1 128 64 32 16 8 4 2 + // For fast NTT (less than O(n^2)) use Cooley-Tukey for NTT, then perform + // element-wise multiplication, and finally apply Gentleman-Sande for + // inverse NTT. + + // Addition mod F_n with overflow + auto cyclic_add = [this](const bvt &x, const bvt &y) { + PRECONDITION(x.size() == y.size()); + + auto result_with_overflow = adder(x, y, const_literal(false)); + if(result_with_overflow.second.is_false()) + return result_with_overflow.first; + + std::cerr << "OVERFLOW" << std::endl; + return add( + result_with_overflow.first, + zero_extension(bvt{1, result_with_overflow.second}, x.size())); + }; + + // Compute NTT + std::vector a_j, b_j; + a_j.reserve(num_chunks); + b_j.reserve(num_chunks); + for(std::size_t j = 0; j < num_chunks; ++j) + { + // All NTT steps are mod F_n, i.e., mod 2^2^n + 1, which implies we need + // 2^(n + 1) bits to represent numbers + a_j.push_back(zero_extension(a_rho[j], (std::size_t)1 << (n + 1))); + b_j.push_back(zero_extension(b_sigma[j], (std::size_t)1 << (n + 1))); + } + // Use in-place iterative Cooley-Tukey + std::vector Aa, Ab; + Aa.reserve(num_chunks); + Ab.reserve(num_chunks); + // In the following we use k represented as bits k_{n - 1}...k_0 and + // j_0...j_{n - 1}, i.e., the most-significant bit of k is k_{n - 1} while the + // MSB for j is j_0. 
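// Illustration (not part of the patch): the bit-reversal permutation used
// below, on concrete indices, assuming num_chunks == 8 (3 index bits):
//   k = 0 (000) -> j = 0 (000)    k = 1 (001) -> j = 4 (100)
//   k = 2 (010) -> j = 2 (010)    k = 3 (011) -> j = 6 (110)
//   k = 4 (100) -> j = 1 (001)    k = 5 (101) -> j = 5 (101)
//   k = 6 (110) -> j = 3 (011)    k = 7 (111) -> j = 7 (111)
// A stand-alone version of the reversal loop that follows:
//   std::size_t reverse_bits(std::size_t k, std::size_t bits)
//   {
//     std::size_t j = 0;
//     for(std::size_t nu = 0; nu < bits; ++nu)
//     {
//       j <<= 1;
//       j |= (k >> nu) & 1;
//     }
//     return j;
//   }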
+ for(std::size_t k = 0; k < num_chunks; ++k) + { + // reverse n (n - 1 if m is even) bits of k + std::size_t j = 0; + for(std::size_t nu = 0; nu < address_bits(num_chunks); ++nu) + { + j <<= 1; // the initial shift has no effect + j |= (k & (1 << nu)) >> nu; + } + std::cerr << "k=" << k << " yields j=" << j << std::endl; + Aa.push_back(a_j[j]); + Ab.push_back(b_j[j]); + std::cerr << "Aa[0](" << k << "): " << beautify(Aa.back()) << std::endl; + } + for(std::size_t nu = 0; nu < address_bits(num_chunks); ++nu) + { + const std::size_t bit_nu = (std::size_t)1 << nu; + std::size_t bits_up_to_nu = 0; + for(std::size_t i = 0; i < nu; ++i) + bits_up_to_nu |= 1 << i; + + // we only need odd ones + for(std::size_t k = 1; k < num_chunks; k += 2) + { + if((k & bit_nu) == 0) + continue; + + bvt Aa_nu_bit_is_zero = Aa[k & ~bit_nu]; + bvt Ab_nu_bit_is_zero = Ab[k & ~bit_nu]; + + std::cerr << "Round " << (nu + 1) << ", k=" << k + << ", k & ~bit_nu=" << (k & ~bit_nu) << std::endl; + const std::size_t chi = (k & bits_up_to_nu) + << (address_bits(num_chunks) - 1 - nu); + std::cerr << "k & bits_up_to_nu=" << (k & bits_up_to_nu) + << ", chi=" << chi << std::endl; + const std::size_t omega = m % 2 == 1 ? 2 : 4; + const std::size_t shift_dist = chi * omega / 2; + + if(nu > 0) // no need to update even indices + { + Aa[k & ~bit_nu] = cyclic_add( + Aa_nu_bit_is_zero, shift(Aa[k], shiftt::ROTATE_LEFT, shift_dist)); + Ab[k & ~bit_nu] = cyclic_add( + Ab_nu_bit_is_zero, shift(Ab[k], shiftt::ROTATE_LEFT, shift_dist)); + std::cerr << "shift_dist: " << shift_dist << std::endl; + std::cerr << "Aa[" << (nu + 1) << "](" << (k & ~bit_nu) + << "): " << beautify(Aa[k & ~bit_nu]) << std::endl; +#if 0 + std::cerr << "Ab[" << (nu + 1) << "](" << (k & ~bit_nu) + << "): " << beautify(Ab[k & ~bit_nu]) << std::endl; +#endif + } + + // subtraction mod F_n is addition of subtrahend cyclically shifted 2^n + // positions to the left + const std::size_t shift_dist_for_sub = shift_dist + ((std::size_t)1 << n); + Aa[k] = cyclic_add( + Aa_nu_bit_is_zero, + shift(Aa[k], shiftt::ROTATE_LEFT, shift_dist_for_sub)); + Ab[k] = cyclic_add( + Ab_nu_bit_is_zero, + shift(Ab[k], shiftt::ROTATE_LEFT, shift_dist_for_sub)); + std::cerr << "shift_dist_for_sub: " << shift_dist_for_sub << std::endl; + std::cerr << "Aa[" << (nu + 1) << "](" << k << "): " << beautify(Aa[k]) + << std::endl; +#if 0 + std::cerr << "Ab[" << (nu + 1) << "](" << k << "): " << beautify(Ab[k]) + << std::endl; +#endif + } + } + + // Either compute u - v (if u > v), else u - v + 2^2^n + 1 + auto reduce_to_mod_F_n = [this](const bvt &x, std::size_t n) { + const std::size_t two_to_power_of_n = x.size() / 2; + PRECONDITION(two_to_power_of_n == (std::size_t)1 << n); + // std::cerr << "two_to_power_of_n: " << two_to_power_of_n << std::endl; + const bvt u = + zero_extension(bvt{x.begin(), x.begin() + two_to_power_of_n}, x.size()); + // std::cerr << "u: " << beautify(u) << std::endl; + const bvt v = + zero_extension(bvt{x.begin() + two_to_power_of_n, x.end()}, x.size()); + // std::cerr << "v: " << beautify(v) << std::endl; + bvt two_to_power_of_two_to_power_of_n_plus_1 = build_constant(1, x.size()); + two_to_power_of_two_to_power_of_n_plus_1[two_to_power_of_n] = + const_literal(true); + const bvt u_ext = select( + unsigned_less_than(u, v), + add(u, two_to_power_of_two_to_power_of_n_plus_1), + u); + // std::cerr << "u_ext: " << beautify(u_ext) << std::endl; + return sub(u_ext, v); + }; + + std::vector a_hat_k{num_chunks, bvt{}}, b_hat_k{num_chunks, bvt{}}; + // Reduce by F_n + for(std::size_t k 
= 1; k < num_chunks; k += 2) + { + a_hat_k[k] = reduce_to_mod_F_n(Aa[k], n); + std::cerr << "a_hat_k[" << k << "]: " << beautify(a_hat_k[k]) << std::endl; + b_hat_k[k] = reduce_to_mod_F_n(Ab[k], n); + std::cerr << "b_hat_k[" << k << "]: " << beautify(b_hat_k[k]) << std::endl; + } + + // Compute point-wise multiplication + std::vector c_hat_k{num_chunks, bvt{}}; + for(std::size_t k = 1; k < num_chunks; k += 2) + { + // If at least one of a_hat_k[k] or b_hat_k[k] is 2^2^n (i.e., F_n - 1) then + // multiplication would overflow, so handle those cases separately: + // x * 2^2^n = x * -1 (mod F_n) which can be computed by rotating x by 2^n + const std::size_t power_2_n = (std::size_t)1 << n; + c_hat_k[k] = select( + a_hat_k[k][power_2_n], + shift(b_hat_k[k], shiftt::ROTATE_LEFT, power_2_n), + select( + b_hat_k[k][power_2_n], + shift(a_hat_k[k], shiftt::ROTATE_LEFT, power_2_n), + unsigned_multiplier(a_hat_k[k], b_hat_k[k]))); + std::cerr << "c_hat_k[" << k << "]: " << beautify(c_hat_k[k]) << std::endl; + } + + // Apply inverse NTT + for(std::size_t nu = address_bits(num_chunks) - 1; nu > 0; --nu) + { + const std::size_t bit_nu = (std::size_t)1 << nu; + std::size_t bits_up_to_nu = 0; + for(std::size_t i = 0; i < nu; ++i) + bits_up_to_nu |= 1 << i; + + // we only need odd ones + for(std::size_t k = 1; k < num_chunks; k += 2) + { + if((k & bit_nu) == 0) + continue; + + bvt c_hat_k_nu_bit_is_zero = c_hat_k[k & ~bit_nu]; + + c_hat_k[k & ~bit_nu] = shift( + cyclic_add(c_hat_k_nu_bit_is_zero, c_hat_k[k]), + shiftt::ROTATE_RIGHT, + 1); + std::cerr << "c_hat_k[" << nu << "](" << (k & ~bit_nu) + << "): " << beautify(c_hat_k[k & ~bit_nu]) << std::endl; + + const std::size_t chi = (k & bits_up_to_nu) + << (address_bits(num_chunks) - 1 - nu); + const std::size_t omega = m % 2 == 1 ? 2 : 4; + const std::size_t shift_dist = chi * omega / 2 + 1; + std::cerr << "SHIFT: " << shift_dist << std::endl; + + c_hat_k[k] = shift( + cyclic_add( + c_hat_k_nu_bit_is_zero, + shift(c_hat_k[k], shiftt::ROTATE_LEFT, (std::size_t)1 << n)), + shiftt::ROTATE_RIGHT, + shift_dist); + std::cerr << "c_hat_k[" << nu << "](" << k + << "): " << beautify(c_hat_k[k]) << std::endl; + } + } + // Reduce by F_n + std::vector z_j_mod_F_n; + z_j_mod_F_n.reserve(num_chunks / 2); + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + // reverse n - 1 (n - 2 if m is even) bits of j + std::size_t k = 0; + for(std::size_t nu = 0; nu < address_bits(num_chunks) - 1; ++nu) + { + k |= (j & (1 << nu)) >> nu; + k <<= 1; + } + k |= 1; + std::cerr << "j " << j << " maps to " << k << std::endl; + // TODO: we could add a capability to reduce_to_mod_F_n to restrict the + // result to op_size bits so that this call (but not the ones above) could + // use it. + z_j_mod_F_n.push_back(reduce_to_mod_F_n(c_hat_k[k], n)); + std::cerr << "z_j_mod_F_n[" << j << "]: " << beautify(z_j_mod_F_n[j]) + << std::endl; + } + + // Compute final coefficients as xi + delta * F_n where delta = eta - xi (mod + // 2^(n + 2) for odd m and 2^(n + 1) for even m) for eta being z_j and xi + // being z_j_mod_F_n. 
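// Numeric illustration (not part of the patch), assuming n = 2, F_n = 17 and
// modulus 2^(n + 2) = 16 for the small residue: suppose the true coefficient
// is z = 40. Then eta = z mod 16 = 8 and xi = z mod 17 = 6, so
// delta = (eta - xi) mod 16 = 2 and xi + delta * F_n = 6 + 2 * 17 = 40,
// recovering z. This works whenever z < 16 * 17, since by the Chinese
// remainder theorem z mod 16 and z mod 17 determine z uniquely in that range.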
+#if 0 + // To compute the full-width result + const std::size_t result_size = two_to_m_plus_1; +#else + const std::size_t result_size = op_size; +#endif + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + const bvt &eta = z_j[j]; + std::cerr << "eta[" << j << "]: " << beautify(eta) << std::endl; + const bvt &xi = z_j_mod_F_n[j]; + std::cerr << "xi[" << j << "]: " << beautify(xi) << std::endl; + PRECONDITION(eta.size() == address_bits(num_chunks) + 1); + bvt xi_2n_2{xi.begin(), xi.begin() + eta.size()}; + bvt delta = sub(eta, xi_2n_2); + PRECONDITION(delta.size() <= result_size); + std::cerr << "delta[" << j << "]: " << beautify(delta) << std::endl; + bvt delta_times_F_n = add( + shift( + zero_extension(delta, result_size), + shiftt::SHIFT_LEFT, + (std::size_t)1 << n), + zero_extension(delta, result_size)); + std::cerr << "delta * F_n: " << beautify(delta_times_F_n) << std::endl; + if(xi.size() > result_size) + { + bvt xi_result_size{xi.begin(), xi.begin() + result_size}; + z_j[j] = add(xi_result_size, delta_times_F_n); + } + else + { + z_j[j] = add(zero_extension(xi, result_size), delta_times_F_n); + } + std::cerr << "z_j[" << j << "]: " << beautify(z_j[j]) << std::endl; + } + + bvt result = zeros(result_size); + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + if(chunk_size * j >= result_size) + break; + result = add(result, shift(z_j[j], shiftt::SHIFT_LEFT, chunk_size * j)); + } + std::cerr << "result: " << beautify(result) << std::endl; + CHECK_RETURN(result.size() >= op_size); + result.resize(op_size); + std::cerr << "result resized: " << beautify(result) << std::endl; + + return result; +} + bvt bv_utilst::unsigned_multiplier_no_overflow( const bvt &op0, const bvt &op1) @@ -1005,13 +2669,21 @@ bvt bv_utilst::signed_multiplier(const bvt &op0, const bvt &op1) if(op0.empty() || op1.empty()) return bvt(); - literalt sign0=op0[op0.size()-1]; - literalt sign1=op1[op1.size()-1]; + literalt sign0 = sign_bit(op0); + literalt sign1 = sign_bit(op1); bvt neg0=cond_negate(op0, sign0); bvt neg1=cond_negate(op1, sign1); +#ifdef USE_KARATSUBA + bvt result = unsigned_karatsuba_multiplier(neg0, neg1); +#elif defined(USE_TOOM_COOK) + bvt result = unsigned_toom_cook_multiplier(neg0, neg1); +#elif defined(USE_SCHOENHAGE_STRASSEN) + bvt result = unsigned_schoenhage_strassen_multiplier(neg0, neg1); +#else bvt result=unsigned_multiplier(neg0, neg1); +#endif literalt result_sign=prop.lxor(sign0, sign1); @@ -1034,7 +2706,7 @@ bvt bv_utilst::cond_negate(const bvt &bv, const literalt cond) bvt bv_utilst::absolute_value(const bvt &bv) { PRECONDITION(!bv.empty()); - return cond_negate(bv, bv[bv.size()-1]); + return cond_negate(bv, sign_bit(bv)); } bvt bv_utilst::cond_negate_no_overflow(const bvt &bv, literalt cond) @@ -1051,15 +2723,15 @@ bvt bv_utilst::signed_multiplier_no_overflow( if(op0.empty() || op1.empty()) return bvt(); - literalt sign0=op0[op0.size()-1]; - literalt sign1=op1[op1.size()-1]; + literalt sign0 = sign_bit(op0); + literalt sign1 = sign_bit(op1); bvt neg0=cond_negate_no_overflow(op0, sign0); bvt neg1=cond_negate_no_overflow(op1, sign1); bvt result=unsigned_multiplier_no_overflow(neg0, neg1); - prop.l_set_to_false(result[result.size() - 1]); + prop.l_set_to_false(sign_bit(result)); literalt result_sign=prop.lxor(sign0, sign1); @@ -1079,7 +2751,18 @@ bvt bv_utilst::multiplier( switch(rep) { case representationt::SIGNED: return signed_multiplier(op0, op1); +#ifdef USE_KARATSUBA + case representationt::UNSIGNED: + return unsigned_karatsuba_multiplier(op0, op1); +#elif 
defined(USE_TOOM_COOK) + case representationt::UNSIGNED: + return unsigned_toom_cook_multiplier(op0, op1); +#elif defined(USE_SCHOENHAGE_STRASSEN) + case representationt::UNSIGNED: + return unsigned_schoenhage_strassen_multiplier(op0, op1); +#else case representationt::UNSIGNED: return unsigned_multiplier(op0, op1); +#endif } UNREACHABLE; @@ -1112,8 +2795,8 @@ void bv_utilst::signed_divider( bvt _op0(op0), _op1(op1); - literalt sign_0=_op0[_op0.size()-1]; - literalt sign_1=_op1[_op1.size()-1]; + literalt sign_0 = sign_bit(_op0); + literalt sign_1 = sign_bit(_op1); bvt neg_0=negate(_op0), neg_1=negate(_op1); @@ -1409,19 +3092,13 @@ literalt bv_utilst::lt_or_le( { PRECONDITION(bv0.size() == bv1.size()); - literalt top0=bv0[bv0.size()-1], - top1=bv1[bv1.size()-1]; - #ifdef COMPACT_LT_OR_LE if(prop.has_set_to() && prop.cnf_handled_well()) { - bvt compareBelow; // 1 if a compare is needed below this bit - literalt result; - size_t start; - size_t i; - if(rep == representationt::SIGNED) { + literalt top0 = sign_bit(bv0), top1 = sign_bit(bv1); + if(top0.is_false() && top1.is_true()) return const_literal(false); else if(top0.is_true() && top1.is_false()) @@ -1429,8 +3106,9 @@ literalt bv_utilst::lt_or_le( INVARIANT( bv0.size() >= 2, "signed bitvectors should have at least two bits"); - compareBelow = prop.new_variables(bv0.size() - 1); - start = compareBelow.size() - 1; + // 1 if a compare is needed below this bit + bvt compareBelow = prop.new_variables(bv0.size() - 1); + size_t start = compareBelow.size() - 1; literalt &firstComp = compareBelow[start]; if(top0.is_false()) @@ -1442,7 +3120,7 @@ literalt bv_utilst::lt_or_le( else if(top1.is_true()) firstComp = top0; - result = prop.new_variable(); + literalt result = prop.new_variable(); // When comparing signs we are comparing the top bit // Four cases... @@ -1454,77 +3132,137 @@ literalt bv_utilst::lt_or_le( #ifdef INCLUDE_REDUNDANT_CLAUSES prop.lcnf(top0, !top1, !firstComp); prop.lcnf(!top0, top1, !firstComp); -#endif - } - else - { - // Unsigned is much easier - compareBelow = prop.new_variables(bv0.size() - 1); - compareBelow.push_back(const_literal(true)); - start = compareBelow.size() - 1; - result = prop.new_variable(); - } +# endif - // Determine the output - // \forall i . cb[i] & -a[i] & b[i] => result - // \forall i . cb[i] & a[i] & -b[i] => -result - i = start; - do - { - if(compareBelow[i].is_false()) - continue; - else if(compareBelow[i].is_true()) + // Determine the output + // \forall i . cb[i] & -a[i] & b[i] => result + // \forall i . cb[i] & a[i] & -b[i] => -result + size_t i = start; + do + { + if(compareBelow[i].is_false()) + continue; + + prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result); + prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result); + } while(i-- != 0); + + // Chain the comparison bit + // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1] + // \forall i != 0 . cb[i] & -a[i] & -b[i] => cb[i-1] + for(i = start; i > 0; i--) { - if(bv0[i].is_false() && bv1[i].is_true()) - return const_literal(true); - else if(bv0[i].is_true() && bv1[i].is_false()) - return const_literal(false); + prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i - 1]); } - prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result); - prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result); - } - while(i-- != 0); +# ifdef INCLUDE_REDUNDANT_CLAUSES + // Optional zeroing of the comparison bit when not needed + // \forall i != 0 . -c[i] => -c[i-1] + // \forall i != 0 . 
@@ -1409,19 +3092,13 @@ literalt bv_utilst::lt_or_le(
 {
   PRECONDITION(bv0.size() == bv1.size());
 
-  literalt top0=bv0[bv0.size()-1],
-           top1=bv1[bv1.size()-1];
-
 #ifdef COMPACT_LT_OR_LE
   if(prop.has_set_to() && prop.cnf_handled_well())
   {
-    bvt compareBelow; // 1 if a compare is needed below this bit
-    literalt result;
-    size_t start;
-    size_t i;
-
     if(rep == representationt::SIGNED)
     {
+      literalt top0 = sign_bit(bv0), top1 = sign_bit(bv1);
+
       if(top0.is_false() && top1.is_true())
         return const_literal(false);
       else if(top0.is_true() && top1.is_false())
@@ -1429,8 +3106,9 @@ literalt bv_utilst::lt_or_le(
       INVARIANT(
         bv0.size() >= 2, "signed bitvectors should have at least two bits");
 
-      compareBelow = prop.new_variables(bv0.size() - 1);
-      start = compareBelow.size() - 1;
+      // 1 if a compare is needed below this bit
+      bvt compareBelow = prop.new_variables(bv0.size() - 1);
+      size_t start = compareBelow.size() - 1;
       literalt &firstComp = compareBelow[start];
 
       if(top0.is_false())
@@ -1442,7 +3120,7 @@ literalt bv_utilst::lt_or_le(
       else if(top1.is_true())
         firstComp = top0;
 
-      result = prop.new_variable();
+      literalt result = prop.new_variable();
 
       // When comparing signs we are comparing the top bit
       // Four cases...
@@ -1454,77 +3132,137 @@ literalt bv_utilst::lt_or_le(
 #ifdef INCLUDE_REDUNDANT_CLAUSES
       prop.lcnf(top0, !top1, !firstComp);
       prop.lcnf(!top0, top1, !firstComp);
-#endif
-    }
-    else
-    {
-      // Unsigned is much easier
-      compareBelow = prop.new_variables(bv0.size() - 1);
-      compareBelow.push_back(const_literal(true));
-      start = compareBelow.size() - 1;
-      result = prop.new_variable();
-    }
+# endif
 
-    // Determine the output
-    // \forall i . cb[i] & -a[i] & b[i] => result
-    // \forall i . cb[i] & a[i] & -b[i] => -result
-    i = start;
-    do
-    {
-      if(compareBelow[i].is_false())
-        continue;
-      else if(compareBelow[i].is_true())
+      // Determine the output
+      // \forall i . cb[i] & -a[i] & b[i] => result
+      // \forall i . cb[i] & a[i] & -b[i] => -result
+      size_t i = start;
+      do
+      {
+        if(compareBelow[i].is_false())
+          continue;
+
+        prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result);
+        prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result);
+      } while(i-- != 0);
+
+      // Chain the comparison bit
+      // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1]
+      // \forall i != 0 . cb[i] & -a[i] & -b[i] => cb[i-1]
+      for(i = start; i > 0; i--)
       {
-        if(bv0[i].is_false() && bv1[i].is_true())
-          return const_literal(true);
-        else if(bv0[i].is_true() && bv1[i].is_false())
-          return const_literal(false);
+        prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i - 1]);
+        prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i - 1]);
       }
 
-      prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result);
-      prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result);
-    }
-    while(i-- != 0);
+# ifdef INCLUDE_REDUNDANT_CLAUSES
+      // Optional zeroing of the comparison bit when not needed
+      // \forall i != 0 . -c[i] => -c[i-1]
+      // \forall i != 0 . c[i] & -a[i] & b[i] => -c[i-1]
+      // \forall i != 0 . c[i] & a[i] & -b[i] => -c[i-1]
+      for(i = start; i > 0; i--)
+      {
+        prop.lcnf(compareBelow[i], !compareBelow[i - 1]);
+        prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], !compareBelow[i - 1]);
+        prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !compareBelow[i - 1]);
+      }
+# endif
 
-    // Chain the comparison bit
-    // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1]
-    // \forall i != 0 . cb[i] & -a[i] & -b[i] => cb[i-1]
-    for(i = start; i > 0; i--)
-    {
-      prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i-1]);
-      prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i-1]);
-    }
+      // The 'base case' of the induction is the case when they are equal
+      prop.lcnf(
+        !compareBelow[0], !bv0[0], !bv1[0], (or_equal) ? result : !result);
+      prop.lcnf(
+        !compareBelow[0], bv0[0], bv1[0], (or_equal) ? result : !result);
+
+      return result;
+    }
+    else
+    {
+      // Unsigned is much easier
+      // 1 if a compare is needed below this bit
+      bvt compareBelow;
+      literalt result;
+      size_t start = bv0.size() - 1;
+
+      // Determine the output
+      // \forall i . cb[i] & -a[i] & b[i] => result
+      // \forall i . cb[i] & a[i] & -b[i] => -result
+      bool same_prefix = true;
+      size_t i = start;
+      do
+      {
+        if(same_prefix)
+        {
+          if(i == 0)
+          {
+            if(or_equal)
+              return prop.lor(!bv0[0], bv1[0]);
+            else
+              return prop.land(!bv0[0], bv1[0]);
+          }
+          else if(bv0[i] == bv1[i])
+            continue;
+          else if(bv0[i].is_false() && bv1[i].is_true())
+            return const_literal(true);
+          else if(bv0[i].is_true() && bv1[i].is_false())
+            return const_literal(false);
+          else
+          {
+            same_prefix = false;
+            start = i;
+            compareBelow = prop.new_variables(i);
+            compareBelow.push_back(const_literal(true));
+            result = prop.new_variable();
+          }
+        }
+
+        prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result);
+        prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result);
+      } while(i-- != 0);
+
+      // Chain the comparison bit
+      // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1]
+      // \forall i != 0 . cb[i] & -a[i] & -b[i] => cb[i-1]
+      for(i = start; i > 0; i--)
+      {
+        prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i - 1]);
+        prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i - 1]);
+      }
 
 #ifdef INCLUDE_REDUNDANT_CLAUSES
-    // Optional zeroing of the comparison bit when not needed
-    // \forall i != 0 . -c[i] => -c[i-1]
-    // \forall i != 0 . c[i] & -a[i] & b[i] => -c[i-1]
-    // \forall i != 0 . c[i] & a[i] & -b[i] => -c[i-1]
-    for(i = start; i > 0; i--)
-    {
-      prop.lcnf(compareBelow[i], !compareBelow[i-1]);
-      prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], !compareBelow[i-1]);
-      prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !compareBelow[i-1]);
-    }
+      // Optional zeroing of the comparison bit when not needed
+      // \forall i != 0 . -c[i] => -c[i-1]
+      // \forall i != 0 . c[i] & -a[i] & b[i] => -c[i-1]
+      // \forall i != 0 . c[i] & a[i] & -b[i] => -c[i-1]
+      for(i = start; i > 0; i--)
+      {
+        prop.lcnf(compareBelow[i], !compareBelow[i - 1]);
+        prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], !compareBelow[i - 1]);
+        prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !compareBelow[i - 1]);
+      }
 #endif
 
-    // The 'base case' of the induction is the case when they are equal
-    prop.lcnf(!compareBelow[0], !bv0[0], !bv1[0], (or_equal)?result:!result);
-    prop.lcnf(!compareBelow[0], bv0[0], bv1[0], (or_equal)?result:!result);
+      // The 'base case' of the induction is the case when they are equal
+      prop.lcnf(
+        !compareBelow[0], !bv0[0], !bv1[0], (or_equal) ? result : !result);
+      prop.lcnf(
+        !compareBelow[0], bv0[0], bv1[0], (or_equal) ? result : !result);
 
-    return result;
+      return result;
+    }
   }
   else
 #endif
   {
+    // the carry-out of A + ~B + 1 (i.e. A - B) is true iff A >= B (unsigned)
     literalt carry=
       carry_out(bv0, inverted(bv1), const_literal(true));
 
     literalt result;
 
     if(rep==representationt::SIGNED)
-      result=prop.lxor(prop.lequal(top0, top1), carry);
+      result = prop.lxor(prop.lequal(sign_bit(bv0), sign_bit(bv1)), carry);
     else
     {
       INVARIANT(
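The compareBelow clauses in both branches axiomatise the usual most-significant-bit-first comparison; on concrete bits the recurrence they encode is simply the reference model below (ours, for illustration; it covers the unsigned case, and the signed case additionally flips the polarity of the top bit as in the four-case analysis above):

#include <cstddef>
#include <vector>

// Reference semantics of lt_or_le on concrete bits: scan from the most
// significant bit down; the first position where the operands differ
// decides, and fully equal vectors fall through to the or_equal 'base case'.
bool lt_or_le_model(
  const std::vector<bool> &a,
  const std::vector<bool> &b,
  bool or_equal)
{
  for(std::size_t i = a.size(); i-- > 0;)
  {
    if(a[i] != b[i])
      return !a[i] && b[i]; // cb[i] holds here; bit i decides the result
  }
  return or_equal; // all bits equal
}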
@@ -1544,12 +3282,7 @@ literalt bv_utilst::unsigned_less_than(
   const bvt &op0,
   const bvt &op1)
 {
-#ifdef COMPACT_LT_OR_LE
   return lt_or_le(false, op0, op1, representationt::UNSIGNED);
-#else
-  // A <= B iff there is an overflow on A-B
-  return !carry_out(op0, inverted(op1), const_literal(true));
-#endif
 }
 
 literalt bv_utilst::signed_less_than(
@@ -1644,3 +3377,63 @@ bvt bv_utilst::verilog_bv_normal_bits(const bvt &src)
 
   return even_bits;
 }
+
+/// Symbolic implementation of popcount (count of 1 bits in a bit vector)
+/// Based on the pop0 algorithm from Hacker's Delight
+/// \param bv: The bit vector to count 1s in
+/// \return A bit vector representing the count
+bvt bv_utilst::popcount(const bvt &bv)
+{
+  PRECONDITION(!bv.empty());
+
+  // Number of halving stages: ceil(log2(bv.size()))
+  std::size_t log2 = address_bits(bv.size());
+  CHECK_RETURN(log2 >= 1);
+
+  // Start with the original bit vector
+  bvt x = bv;
+
+  // Apply the parallel bit counting algorithm from Hacker's Delight (pop0).
+  // The algorithm works by summing adjacent bit groups of increasing sizes.
+
+  // Iterate through the stages of the algorithm, doubling the field size
+  // each time
+  for(std::size_t stage = 0; stage < log2; ++stage)
+  {
+    std::size_t shift_amount = 1 << stage;     // 1, 2, 4, 8, 16, ...
+    std::size_t field_size = 2 * shift_amount; // 2, 4, 8, 16, 32, ...
+
+    // Stop once a shift by shift_amount would move all remaining bits out
+    if(x.size() <= shift_amount)
+      break;
+
+    // Shift the bit vector
+    bvt x_shifted = shift(x, shiftt::SHIFT_LRIGHT, shift_amount);
+
+    // Create a mask with 'shift_amount' ones followed by 'shift_amount'
+    // zeros, repeated
+    bvt mask;
+    mask.reserve(x.size());
+    for(std::size_t i = 0; i < x.size(); i++)
+    {
+      if((i % field_size) < shift_amount)
+        mask.push_back(const_literal(true));
+      else
+        mask.push_back(const_literal(false));
+    }
+
+    // Apply the mask to both the original and shifted bit vectors
+    bvt masked_x, masked_shifted;
+    masked_x.reserve(x.size());
+    masked_shifted.reserve(x.size());
+
+    for(std::size_t i = 0; i < x.size(); i++)
+    {
+      masked_x.push_back(prop.land(x[i], mask[i]));
+      masked_shifted.push_back(prop.land(x_shifted[i], mask[i]));
+    }
+
+    // Add the masked vectors
+    x = add(masked_x, masked_shifted);
+  }
+
+  return x;
+}
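To see the masking schedule concretely, the same reduction on a concrete 8-bit word uses the masks 0x55, 0x33 and 0x0f, which are exactly the constant vectors the loop above builds (the AND gates should simplify away against them). An illustrative, runnable counterpart:

#include <cassert>
#include <cstdint>

// Hacker's Delight style popcount on 8 bits: after stage k, each field of
// 2^(k+1) bits holds the number of 1s among the bits it replaced.
uint8_t popcount8(uint8_t x)
{
  x = (x & 0x55) + ((x >> 1) & 0x55); // pairs:   partial sums 0..2
  x = (x & 0x33) + ((x >> 2) & 0x33); // nibbles: partial sums 0..4
  x = (x & 0x0f) + ((x >> 4) & 0x0f); // byte:    final sum 0..8
  return x;
}

int main()
{
  assert(popcount8(0xb7) == 6); // 0b1011'0111 has six 1-bits
  return 0;
}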
+    return op.back();
   }
 
   literalt is_zero(const bvt &op)
@@ -218,6 +222,12 @@ class bv_utilst
   literalt verilog_bv_has_x_or_z(const bvt &);
   static bvt verilog_bv_normal_bits(const bvt &);
 
+  /// Symbolic implementation of popcount (count of 1 bits in a bit vector)
+  /// Based on the pop0 algorithm from Hacker's Delight
+  /// \param bv: The bit vector to count 1s in
+  /// \return A bit vector representing the count
+  bvt popcount(const bvt &bv);
+
 protected:
   propt &prop;
 
@@ -244,6 +254,7 @@ class bv_utilst
 
   bvt wallace_tree(const std::vector<bvt> &pps);
   bvt dadda_tree(const std::vector<bvt> &pps);
+  bvt comba_column_wise(const std::vector<bvt> &pps);
 };
 
 #endif // CPROVER_SOLVERS_FLATTENING_BV_UTILS_H
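As background for the unsigned_schoenhage_strassen_multiplier declaration (and the delta correction earlier in this patch): the chunk products are reduced modulo a Fermat number F_n = 2^(2^n) + 1, where 2^(2^n) is congruent to -1, so multiplying by a power of two is a shift plus a conditional subtraction. A throwaway check of that identity (ours, not patch code):

#include <cassert>
#include <cstdint>

int main()
{
  const uint64_t n = 2;
  const uint64_t F_n = (1u << (1u << n)) + 1; // F_2 = 2^4 + 1 = 17
  // Doubling 2^n times multiplies by 2^(2^n) == -1 (mod F_n)
  uint64_t x = 3;
  for(uint64_t i = 0; i < (1u << n); ++i)
    x = (2 * x) % F_n;
  assert(x == F_n - 3); // 3 * 16 = 48 == -3 (mod 17)
  return 0;
}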