From b1e854333f2884307c5a66729a731d319f93d98b Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Mon, 14 Jul 2025 10:25:32 +0000 Subject: [PATCH 01/13] Use bv_utilst::sign_bit We were still repeatedly using `bv[bv.size() - 1]` in place, when using `sign_bit(bv)` adds clarity and avoids having to understand encoding details. Also, use `.back()` to avoid unnecessary repeat arithmetic (which the compiler may or may not optimise away). --- src/solvers/flattening/boolbv.cpp | 2 +- src/solvers/flattening/bv_utils.cpp | 36 ++++++++++++++--------------- src/solvers/flattening/bv_utils.h | 2 +- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/solvers/flattening/boolbv.cpp b/src/solvers/flattening/boolbv.cpp index b94cd6db0c3..633c7f0baee 100644 --- a/src/solvers/flattening/boolbv.cpp +++ b/src/solvers/flattening/boolbv.cpp @@ -407,7 +407,7 @@ literalt boolbvt::convert_rest(const exprt &expr) CHECK_RETURN(!bv.empty()); const irep_idt type_id = op.type().id(); if(type_id == ID_signedbv || type_id == ID_fixedbv || type_id == ID_floatbv) - return bv[bv.size()-1]; + return bv_utils.sign_bit(bv); if(type_id == ID_unsignedbv) return const_literal(false); } diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 31dd12fea20..7b7933d21db 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -432,12 +432,11 @@ literalt bv_utilst::overflow_add( // An overflow occurs if the signs of the two operands are the same // and the sign of the sum is the opposite. - literalt old_sign=op0[op0.size()-1]; - literalt sign_the_same=prop.lequal(op0[op0.size()-1], op1[op1.size()-1]); + literalt old_sign = sign_bit(op0); + literalt sign_the_same = prop.lequal(sign_bit(op0), sign_bit(op1)); bvt result=add(op0, op1); - return - prop.land(sign_the_same, prop.lxor(result[result.size()-1], old_sign)); + return prop.land(sign_the_same, prop.lxor(sign_bit(result), old_sign)); } else if(rep==representationt::UNSIGNED) { @@ -457,7 +456,7 @@ literalt bv_utilst::overflow_sub( // x is negative, always representable, and // thus not an overflow. literalt op1_is_int_min=is_int_min(op1); - literalt op0_is_negative=op0[op0.size()-1]; + literalt op0_is_negative = sign_bit(op0); return prop.lselect(op1_is_int_min, @@ -557,7 +556,7 @@ bvt bv_utilst::shift(const bvt &src, const shiftt s, std::size_t dist) case shiftt::SHIFT_ARIGHT: // src.size()-i won't underflow as i Date: Tue, 8 Jul 2025 14:53:11 +0200 Subject: [PATCH 02/13] Implement popcount in flattening back-end We can avoid lowering, and eventually re-use this as part of other algorithms. 
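For reference, the encoding added in bv_utilst::popcount mirrors the masked
parallel-summation variant of population count (pop0 in Hacker's Delight).
A minimal concrete (non-symbolic) sketch of that scheme on a 32-bit word,
for illustration only:

    #include <cassert>
    #include <cstdint>

    // Each step sums adjacent fields of doubling width; the masks keep
    // the per-field sums from overflowing into neighbouring fields.
    static unsigned pop0(uint32_t x)
    {
      x = (x & 0x55555555u) + ((x >> 1) & 0x55555555u);  // 2-bit fields
      x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  // 4-bit fields
      x = (x & 0x0F0F0F0Fu) + ((x >> 4) & 0x0F0F0F0Fu);  // 8-bit fields
      x = (x & 0x00FF00FFu) + ((x >> 8) & 0x00FF00FFu);  // 16-bit fields
      x = (x & 0x0000FFFFu) + (x >> 16);                 // 32-bit field
      return x;
    }

    int main()
    {
      assert(pop0(0xF0F0F0F0u) == 16);
    }

The symbolic version in this patch plays the same game over vectors of
literals, using prop.land for the masking and the circuit adder for the
per-field sums.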
--- src/solvers/Makefile | 1 + src/solvers/flattening/boolbv.cpp | 2 +- src/solvers/flattening/boolbv.h | 2 + src/solvers/flattening/boolbv_popcount.cpp | 20 +++++++ src/solvers/flattening/bv_utils.cpp | 62 ++++++++++++++++++++++ src/solvers/flattening/bv_utils.h | 6 +++ 6 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 src/solvers/flattening/boolbv_popcount.cpp diff --git a/src/solvers/Makefile b/src/solvers/Makefile index 71c7a01b3ed..d8c790ae8db 100644 --- a/src/solvers/Makefile +++ b/src/solvers/Makefile @@ -118,6 +118,7 @@ SRC = $(BOOLEFORCE_SRC) \ flattening/boolbv_onehot.cpp \ flattening/boolbv_overflow.cpp \ flattening/boolbv_power.cpp \ + flattening/boolbv_popcount.cpp \ flattening/boolbv_quantifier.cpp \ flattening/boolbv_reduction.cpp \ flattening/boolbv_replication.cpp \ diff --git a/src/solvers/flattening/boolbv.cpp b/src/solvers/flattening/boolbv.cpp index 633c7f0baee..16c034702de 100644 --- a/src/solvers/flattening/boolbv.cpp +++ b/src/solvers/flattening/boolbv.cpp @@ -235,7 +235,7 @@ bvt boolbvt::convert_bitvector(const exprt &expr) else if(expr.id()==ID_power) return convert_power(to_binary_expr(expr)); else if(expr.id() == ID_popcount) - return convert_bv(simplify_expr(to_popcount_expr(expr).lower(), ns)); + return convert_popcount(to_popcount_expr(expr)); else if(expr.id() == ID_count_leading_zeros) { return convert_bv( diff --git a/src/solvers/flattening/boolbv.h b/src/solvers/flattening/boolbv.h index 62d2703ee10..c305b555ee0 100644 --- a/src/solvers/flattening/boolbv.h +++ b/src/solvers/flattening/boolbv.h @@ -37,6 +37,7 @@ class floatbv_round_to_integral_exprt; class floatbv_typecast_exprt; class ieee_float_op_exprt; class overflow_result_exprt; +class popcount_exprt; class replication_exprt; class unary_overflow_exprt; class union_typet; @@ -203,6 +204,7 @@ class boolbvt:public arrayst virtual bvt convert_bitreverse(const bitreverse_exprt &expr); virtual bvt convert_saturating_add_sub(const binary_exprt &expr); virtual bvt convert_overflow_result(const overflow_result_exprt &expr); + virtual bvt convert_popcount(const popcount_exprt &expr); bvt convert_update_bits(bvt src, const exprt &index, const bvt &new_value); diff --git a/src/solvers/flattening/boolbv_popcount.cpp b/src/solvers/flattening/boolbv_popcount.cpp new file mode 100644 index 00000000000..95a483facac --- /dev/null +++ b/src/solvers/flattening/boolbv_popcount.cpp @@ -0,0 +1,20 @@ +/*******************************************************************\ + +Module: + +Author: Michael Tautschnig + +\*******************************************************************/ + +#include + +#include "boolbv.h" + +bvt boolbvt::convert_popcount(const popcount_exprt &expr) +{ + const std::size_t width = boolbv_width(expr.type()); + + bvt op = convert_bv(expr.op()); + + return bv_utils.zero_extension(bv_utils.popcount(op), width); +} diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 7b7933d21db..e527d32eb8b 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -8,6 +8,8 @@ Author: Daniel Kroening, kroening@kroening.com #include "bv_utils.h" +#include + #include #include @@ -1642,3 +1644,63 @@ bvt bv_utilst::verilog_bv_normal_bits(const bvt &src) return even_bits; } + +/// Symbolic implementation of popcount (count of 1 bits in a bit vector) +/// Based on the pop0 algorithm from Hacker's Delight +/// \param bv: The bit vector to count 1s in +/// \return A bit vector representing the count +bvt bv_utilst::popcount(const bvt 
&bv) +{ + PRECONDITION(!bv.empty()); + + // Determine the result width: log2(bv.size()) + 1 + std::size_t log2 = address_bits(bv.size()); + CHECK_RETURN(log2 >= 1); + + // Start with the original bit vector + bvt x = bv; + + // Apply the parallel bit counting algorithm from Hacker's Delight (pop0). + // The algorithm works by summing adjacent bit groups of increasing sizes. + + // Iterate through the stages of the algorithm, doubling the field size each time + for(std::size_t stage = 0; stage < log2; ++stage) + { + std::size_t shift_amount = 1 << stage; // 1, 2, 4, 8, 16, ... + std::size_t field_size = 2 * shift_amount; // 2, 4, 8, 16, 32, ... + + // Skip if the bit vector is smaller than the field size + if(x.size() <= shift_amount) + break; + + // Shift the bit vector + bvt x_shifted = shift(x, shiftt::SHIFT_LRIGHT, shift_amount); + + // Create a mask with 'shift_amount' ones followed by 'shift_amount' zeros, repeated + bvt mask; + mask.reserve(x.size()); + for(std::size_t i = 0; i < x.size(); i++) + { + if((i % field_size) < shift_amount) + mask.push_back(const_literal(true)); + else + mask.push_back(const_literal(false)); + } + + // Apply the mask to both the original and shifted bit vectors + bvt masked_x, masked_shifted; + masked_x.reserve(x.size()); + masked_shifted.reserve(x.size()); + + for(std::size_t i = 0; i < x.size(); i++) + { + masked_x.push_back(prop.land(x[i], mask[i])); + masked_shifted.push_back(prop.land(x_shifted[i], mask[i])); + } + + // Add the masked vectors + x = add(masked_x, masked_shifted); + } + + return x; +} diff --git a/src/solvers/flattening/bv_utils.h b/src/solvers/flattening/bv_utils.h index dd5dfc67451..60ba1422b56 100644 --- a/src/solvers/flattening/bv_utils.h +++ b/src/solvers/flattening/bv_utils.h @@ -218,6 +218,12 @@ class bv_utilst literalt verilog_bv_has_x_or_z(const bvt &); static bvt verilog_bv_normal_bits(const bvt &); + /// Symbolic implementation of popcount (count of 1 bits in a bit vector) + /// Based on the pop0 algorithm from Hacker's Delight + /// \param bv: The bit vector to count 1s in + /// \return A bit vector representing the count + bvt popcount(const bvt &bv); + protected: propt ∝ From 7e169f683e8a9223588212ec43811af8130551dd Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Mon, 14 Jul 2025 10:42:47 +0000 Subject: [PATCH 03/13] Improve bv_utilst less-than-or-less-or-equals 1. Duplicate some code to specialise it for signed/unsigned and, thereby, add clarity. 2. De-duplicate code to handle all cases directly in lt_or_le. 3. Make sure we constant-propagate whatever is possible in unsigned comparison to avoid introducing unnecessary fresh literals. 
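As a reading aid, the chained encoding computes (for the unsigned case) the
same function as this minimal non-symbolic reference model, with plain bools
standing in for literals and index 0 being the least-significant bit:

    #include <cassert>
    #include <vector>

    // Scan from the MSB: the first position where the operands differ
    // decides the comparison; if no position differs, the operands are
    // equal and the result is the or_equal flag (the 'base case').
    static bool lt_or_le_model(
      bool or_equal,
      const std::vector<bool> &a,
      const std::vector<bool> &b)
    {
      for(std::size_t i = a.size(); i-- > 0;)
        if(a[i] != b[i])
          return !a[i] && b[i];
      return or_equal;
    }

    int main()
    {
      assert(lt_or_le_model(false, {true, false}, {false, true})); // 1 < 2
      assert(!lt_or_le_model(false, {true, true}, {true, true}));  // not 3 < 3
      assert(lt_or_le_model(true, {true, true}, {true, true}));    // 3 <= 3
    }

The signed case only differs in how the two sign bits are compared before the
chain over the remaining bits starts.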
--- src/solvers/flattening/bv_utils.cpp | 175 ++++++++++++++++++---------- 1 file changed, 113 insertions(+), 62 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index e527d32eb8b..a75a7bcbc70 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -1413,11 +1413,6 @@ literalt bv_utilst::lt_or_le( #ifdef COMPACT_LT_OR_LE if(prop.has_set_to() && prop.cnf_handled_well()) { - bvt compareBelow; // 1 if a compare is needed below this bit - literalt result; - size_t start; - size_t i; - if(rep == representationt::SIGNED) { literalt top0 = sign_bit(bv0), top1 = sign_bit(bv1); @@ -1429,8 +1424,9 @@ literalt bv_utilst::lt_or_le( INVARIANT( bv0.size() >= 2, "signed bitvectors should have at least two bits"); - compareBelow = prop.new_variables(bv0.size() - 1); - start = compareBelow.size() - 1; + // 1 if a compare is needed below this bit + bvt compareBelow = prop.new_variables(bv0.size() - 1); + size_t start = compareBelow.size() - 1; literalt &firstComp = compareBelow[start]; if(top0.is_false()) @@ -1442,7 +1438,7 @@ literalt bv_utilst::lt_or_le( else if(top1.is_true()) firstComp = top0; - result = prop.new_variable(); + literalt result = prop.new_variable(); // When comparing signs we are comparing the top bit // Four cases... @@ -1454,70 +1450,130 @@ literalt bv_utilst::lt_or_le( #ifdef INCLUDE_REDUNDANT_CLAUSES prop.lcnf(top0, !top1, !firstComp); prop.lcnf(!top0, top1, !firstComp); -#endif - } - else - { - // Unsigned is much easier - compareBelow = prop.new_variables(bv0.size() - 1); - compareBelow.push_back(const_literal(true)); - start = compareBelow.size() - 1; - result = prop.new_variable(); - } +# endif - // Determine the output - // \forall i . cb[i] & -a[i] & b[i] => result - // \forall i . cb[i] & a[i] & -b[i] => -result - i = start; - do - { - if(compareBelow[i].is_false()) - continue; - else if(compareBelow[i].is_true()) + // Determine the output + // \forall i . cb[i] & -a[i] & b[i] => result + // \forall i . cb[i] & a[i] & -b[i] => -result + size_t i = start; + do { - if(bv0[i].is_false() && bv1[i].is_true()) - return const_literal(true); - else if(bv0[i].is_true() && bv1[i].is_false()) - return const_literal(false); + if(compareBelow[i].is_false()) + continue; + + prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result); + prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result); + } while(i-- != 0); + + // Chain the comparison bit + // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1] + // \forall i != 0 . cb[i] & -a[i] & -b[i] => cb[i-1] + for(i = start; i > 0; i--) + { + prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i - 1]); } - prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result); - prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result); - } - while(i-- != 0); +# ifdef INCLUDE_REDUNDANT_CLAUSES + // Optional zeroing of the comparison bit when not needed + // \forall i != 0 . -c[i] => -c[i-1] + // \forall i != 0 . c[i] & -a[i] & b[i] => -c[i-1] + // \forall i != 0 . c[i] & a[i] & -b[i] => -c[i-1] + for(i = start; i > 0; i--) + { + prop.lcnf(compareBelow[i], !compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], !compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !compareBelow[i - 1]); + } +# endif - // Chain the comparison bit - // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1] - // \forall i != 0 . 
cb[i] & -a[i] & -b[i] => cb[i-1] - for(i = start; i > 0; i--) - { - prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i-1]); - prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i-1]); - } + // The 'base case' of the induction is the case when they are equal + prop.lcnf( + !compareBelow[0], !bv0[0], !bv1[0], (or_equal) ? result : !result); + prop.lcnf( + !compareBelow[0], bv0[0], bv1[0], (or_equal) ? result : !result); + return result; + } + else + { + // Unsigned is much easier + // 1 if a compare is needed below this bit + bvt compareBelow; + literalt result; + size_t start = bv0.size() - 1; + + // Determine the output + // \forall i . cb[i] & -a[i] & b[i] => result + // \forall i . cb[i] & a[i] & -b[i] => -result + bool same_prefix = true; + size_t i = start; + do + { + if(same_prefix) + { + if(i == 0) + { + if(or_equal) + return prop.lor(!bv0[0], bv1[0]); + else + return prop.land(!bv0[0], bv1[0]); + } + else if(bv0[i] == bv1[i]) + continue; + else if(bv0[i].is_false() && bv1[i].is_true()) + return const_literal(true); + else if(bv0[i].is_true() && bv1[i].is_false()) + return const_literal(false); + else + { + same_prefix = false; + start = i; + compareBelow = prop.new_variables(i); + compareBelow.push_back(const_literal(true)); + result = prop.new_variable(); + } + } + + prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], result); + prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !result); + } while(i-- != 0); + + // Chain the comparison bit + // \forall i != 0 . cb[i] & a[i] & b[i] => cb[i-1] + // \forall i != 0 . cb[i] & -a[i] & -b[i] => cb[i-1] + for(i = start; i > 0; i--) + { + prop.lcnf(!compareBelow[i], !bv0[i], !bv1[i], compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], bv0[i], bv1[i], compareBelow[i - 1]); + } #ifdef INCLUDE_REDUNDANT_CLAUSES - // Optional zeroing of the comparison bit when not needed - // \forall i != 0 . -c[i] => -c[i-1] - // \forall i != 0 . c[i] & -a[i] & b[i] => -c[i-1] - // \forall i != 0 . c[i] & a[i] & -b[i] => -c[i-1] - for(i = start; i > 0; i--) - { - prop.lcnf(compareBelow[i], !compareBelow[i-1]); - prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], !compareBelow[i-1]); - prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !compareBelow[i-1]); - } + // Optional zeroing of the comparison bit when not needed + // \forall i != 0 . -c[i] => -c[i-1] + // \forall i != 0 . c[i] & -a[i] & b[i] => -c[i-1] + // \forall i != 0 . c[i] & a[i] & -b[i] => -c[i-1] + for(i = start; i > 0; i--) + { + prop.lcnf(compareBelow[i], !compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], bv0[i], !bv1[i], !compareBelow[i - 1]); + prop.lcnf(!compareBelow[i], !bv0[i], bv1[i], !compareBelow[i - 1]); + } #endif - // The 'base case' of the induction is the case when they are equal - prop.lcnf(!compareBelow[0], !bv0[0], !bv1[0], (or_equal)?result:!result); - prop.lcnf(!compareBelow[0], bv0[0], bv1[0], (or_equal)?result:!result); + // The 'base case' of the induction is the case when they are equal + prop.lcnf( + !compareBelow[0], !bv0[0], !bv1[0], (or_equal) ? result : !result); + prop.lcnf( + !compareBelow[0], bv0[0], bv1[0], (or_equal) ? 
result : !result); - return result; + return result; + } } else #endif { + // A <= B iff there is an overflow on A-B literalt carry= carry_out(bv0, inverted(bv1), const_literal(true)); @@ -1544,12 +1600,7 @@ literalt bv_utilst::unsigned_less_than( const bvt &op0, const bvt &op1) { -#ifdef COMPACT_LT_OR_LE return lt_or_le(false, op0, op1, representationt::UNSIGNED); -#else - // A <= B iff there is an overflow on A-B - return !carry_out(op0, inverted(op1), const_literal(true)); -#endif } literalt bv_utilst::signed_less_than( From 79f577ffb184346347fd5a184a7de382019b117b Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Sun, 11 Sep 2022 12:08:17 +0000 Subject: [PATCH 04/13] Higher radix multiplier encoding The Radix-4 multiplier pre-computes products for x * 0 (zero), x * 1 (x), x * 2 (left-shift x), x * 3 (x * 2 + x) to reduce the number of sums by a factor of 2. Radix-8 extends this up to x * 7 (reducing sums by a factor of 3), and Radix-16 up to x * 15 (reducing sums by a factor of 4). This modified approach to computing partial products can be freely combined with different (tree) reduction schemes. Benchmarking results can be found at https://tinyurl.com/multiplier-comparison (shortened URL for https://docs.google.com/spreadsheets/d/197uDKVXYRVAQdxB64wZCoWnoQ_TEpyEIw7K5ctJ7taM/). The data suggests that combining Radix-8 partial product pre-computation with Dadda's reduction can yield substantial performance gains while not substantially regressing in other benchmark/solver pairs. --- src/solvers/flattening/bv_utils.cpp | 883 +++++++++++++++++++++++++++- 1 file changed, 875 insertions(+), 8 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index a75a7bcbc70..66801a0cfc9 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -781,9 +781,10 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // been observed to go up by 5%-10%, and on some models even by 20%. // #define WALLACE_TREE // Dadda' reduction scheme. This yields a smaller formula size than Wallace -// trees (and also the default addition scheme), but remains disabled as it -// isn't consistently more performant either. -// #define DADDA_TREE +// trees (and also the default addition scheme), but isn't consistently more +// performant with simple partial-product generation. Only when using +// higher-radix multipliers the combination appears to perform better. +#define DADDA_TREE // The following examples demonstrate the performance differences (with a // time-out of 7200 seconds): @@ -917,16 +918,84 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // our multiplier that's not using a tree reduction scheme, but aren't uniformly // better either. +// Higher radix multipliers pre-compute partial products for groups of bits: +// radix-4 are groups of 2 bits, radix-8 are groups of 3 bits, and radix-16 are +// groups of 4 bits. Performance data for these variants combined with different +// (tree) reduction schemes are recorded at +// https://tinyurl.com/multiplier-comparison. The data suggests that radix-8 +// with Dadda's reduction yields the most consistent performance improvement +// while not regressing substantially in the matrix of different benchmarks and +// CaDiCaL and MiniSat2 as solvers. 
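+// As a concrete illustration (not tied to any particular benchmark): in
+// radix-4, each 2-bit group of op1 selects among the pre-computed multiples
+// 0, x, 2x, 3x, so x * 0b1101 reduces to (x * 0b01) + ((x * 0b11) << 2),
+// i.e., two partial products instead of four.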
+#define RADIX_MULTIPLIER 8 +#ifdef RADIX_MULTIPLIER +# define DADDA_TREE +#endif + +#ifdef RADIX_MULTIPLIER +static bvt unsigned_multiply_by_3(propt &prop, const bvt &op) +{ + PRECONDITION(prop.cnf_handled_well()); + PRECONDITION(!op.empty()); + + bvt result; + result.reserve(op.size()); + + result.push_back(op[0]); + literalt prev_bit = const_literal(false); + + for(std::size_t i = 1; i < op.size(); ++i) + { + literalt sum = prop.new_variable(); + + prop.lcnf({sum, !op[i - 1], !op[i], !prev_bit}); + prop.lcnf({sum, !op[i - 1], !op[i], result.back()}); + prop.lcnf({sum, op[i - 1], op[i], !prev_bit, result.back()}); + prop.lcnf({sum, !op[i - 1], op[i], prev_bit, !result.back()}); + prop.lcnf({sum, op[i - 1], !op[i], !result.back()}); + prop.lcnf({sum, op[i - 1], !op[i], prev_bit}); + + prop.lcnf({!sum, !op[i - 1], op[i], !prev_bit}); + prop.lcnf({!sum, !op[i - 1], op[i], result.back()}); + prop.lcnf({!sum, !op[i - 1], !op[i], prev_bit, !result.back()}); + + prop.lcnf({!sum, op[i - 1], op[i], !result.back()}); + prop.lcnf({!sum, op[i - 1], op[i], prev_bit}); + prop.lcnf({!sum, op[i - 1], !op[i], !prev_bit, result.back()}); + + prop.lcnf({!sum, op[i], prev_bit, result.back()}); + prop.lcnf({!sum, op[i], !prev_bit, !result.back()}); + + result.push_back(sum); + prev_bit = op[i - 1]; + } + + return result; +} +#endif + bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1) { - bvt op0=_op0, op1=_op1; + PRECONDITION(!_op0.empty()); + PRECONDITION(!_op1.empty()); - if(is_constant(op1)) - std::swap(op0, op1); + if(_op1.size() == 1) + { + bvt product; + product.reserve(_op0.size()); + for(const auto &lit : _op0) + product.push_back(prop.land(lit, _op1.front())); + return product; + } - // build the usual quadratic number of partial products + // store partial products std::vector pps; - pps.reserve(op0.size()); + pps.reserve(_op0.size()); + + bvt op0 = _op0, op1 = _op1; + +#ifndef RADIX_MULTIPLIER + if(is_constant(op1)) + std::swap(op0, op1); for(std::size_t bit=0; bit times_three_opt; + auto times_three = [this, ×_three_opt, &op0]() -> const bvt & + { + if(!times_three_opt.has_value()) + { +# if 1 + if(prop.cnf_handled_well()) + times_three_opt = unsigned_multiply_by_3(prop, op0); + else +# endif + times_three_opt = add(op0, shift(op0, shiftt::SHIFT_LEFT, 1)); + } + return *times_three_opt; + }; + +# if RADIX_MULTIPLIER >= 8 + std::optional times_five_opt, times_seven_opt; + auto times_five = [this, ×_five_opt, &op0]() -> const bvt & + { + if(!times_five_opt.has_value()) + times_five_opt = add(op0, shift(op0, shiftt::SHIFT_LEFT, 2)); + return *times_five_opt; + }; + auto times_seven = + [this, ×_seven_opt, &op0, ×_three]() -> const bvt & + { + if(!times_seven_opt.has_value()) + times_seven_opt = add(times_three(), shift(op0, shiftt::SHIFT_LEFT, 2)); + return *times_seven_opt; + }; +# endif + +# if RADIX_MULTIPLIER == 16 + std::optional times_nine_opt, times_eleven_opt, times_thirteen_opt, + times_fifteen_opt; + auto times_nine = [this, ×_nine_opt, &op0]() -> const bvt & + { + if(!times_nine_opt.has_value()) + times_nine_opt = add(op0, shift(op0, shiftt::SHIFT_LEFT, 3)); + return *times_nine_opt; + }; + auto times_eleven = + [this, ×_eleven_opt, &op0, ×_three]() -> const bvt & + { + if(!times_eleven_opt.has_value()) + times_eleven_opt = add(times_three(), shift(op0, shiftt::SHIFT_LEFT, 3)); + return *times_eleven_opt; + }; + auto times_thirteen = + [this, ×_thirteen_opt, &op0, ×_five]() -> const bvt & + { + if(!times_thirteen_opt.has_value()) + times_thirteen_opt = add(times_five(), 
shift(op0, shiftt::SHIFT_LEFT, 3)); + return *times_thirteen_opt; + }; + auto times_fifteen = + [this, ×_fifteen_opt, &op0, ×_seven]() -> const bvt & + { + if(!times_fifteen_opt.has_value()) + times_fifteen_opt = add(times_seven(), shift(op0, shiftt::SHIFT_LEFT, 3)); + return *times_fifteen_opt; + }; +# endif + + for(std::size_t op1_idx = 0; op1_idx + RADIX_GROUP_SIZE - 1 < op1.size(); + op1_idx += RADIX_GROUP_SIZE) + { + const literalt &bit0 = op1[op1_idx]; + const literalt &bit1 = op1[op1_idx + 1]; +# if RADIX_MULTIPLIER >= 8 + const literalt &bit2 = op1[op1_idx + 2]; +# if RADIX_MULTIPLIER == 16 + const literalt &bit3 = op1[op1_idx + 3]; +# endif +# endif + bvt partial_sum; + + if( + bit0.is_constant() && bit1.is_constant() +# if RADIX_MULTIPLIER >= 8 + && bit2.is_constant() +# if RADIX_MULTIPLIER == 16 + && bit3.is_constant() +# endif +# endif + ) + { + if(bit0.is_false()) // *0 + { + if(bit1.is_false()) // *00 + { +# if RADIX_MULTIPLIER >= 8 + if(bit2.is_false()) // *000 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0000 + continue; + else // 1000 + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 3); +# else + continue; +# endif + } + else // *100 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0100 + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 2); + else // 1100 + partial_sum = + shift(times_three(), shiftt::SHIFT_LEFT, op1_idx + 2); +# else + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 2); +# endif + } +# else + continue; +# endif + } + else // *10 + { +# if RADIX_MULTIPLIER >= 8 + if(bit2.is_false()) // *010 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0010 + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 1); + else // 1010 + partial_sum = + shift(times_five(), shiftt::SHIFT_LEFT, op1_idx + 1); +# else + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 1); +# endif + } + else // *110 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0110 + partial_sum = + shift(times_three(), shiftt::SHIFT_LEFT, op1_idx + 1); + else // 1110 + partial_sum = + shift(times_seven(), shiftt::SHIFT_LEFT, op1_idx + 1); +# else + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx + 1); +# endif + } +# else + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx + 1); +# endif + } + } + else // *1 + { + if(bit1.is_false()) // *01 + { +# if RADIX_MULTIPLIER >= 8 + if(bit2.is_false()) // *001 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0001 + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx); + else // 1001 + partial_sum = shift(times_nine(), shiftt::SHIFT_LEFT, op1_idx); +# else + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx); +# endif + } + else // *101 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0101 + partial_sum = shift(times_five(), shiftt::SHIFT_LEFT, op1_idx); + else // 1101 + partial_sum = + shift(times_thirteen(), shiftt::SHIFT_LEFT, op1_idx); +# else + partial_sum = shift(times_five(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } +# else + partial_sum = shift(op0, shiftt::SHIFT_LEFT, op1_idx); +# endif + } + else // *11 + { +# if RADIX_MULTIPLIER >= 8 + if(bit2.is_false()) // *011 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0011 + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx); + else // 1011 + partial_sum = shift(times_eleven(), shiftt::SHIFT_LEFT, op1_idx); +# else + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } + else // *111 + { +# if RADIX_MULTIPLIER == 16 + if(bit3.is_false()) // 0111 + partial_sum 
= shift(times_seven(), shiftt::SHIFT_LEFT, op1_idx); + else // 1111 + partial_sum = shift(times_fifteen(), shiftt::SHIFT_LEFT, op1_idx); +# else + partial_sum = shift(times_seven(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } +# else + partial_sum = shift(times_three(), shiftt::SHIFT_LEFT, op1_idx); +# endif + } + } + } + else + { + partial_sum = bvt(op1_idx, const_literal(false)); + for(std::size_t op0_idx = 0; op0_idx + op1_idx < op0.size(); ++op0_idx) + { +# if RADIX_MULTIPLIER == 4 + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + + // 00 + prop.lcnf({bit0, bit1, !partial_sum_bit}); + // 01 -> sum = _op0 + prop.lcnf({!bit0, bit1, !partial_sum_bit, _op0[op0_idx]}); + prop.lcnf({!bit0, bit1, partial_sum_bit, !_op0[op0_idx]}); + // 10 -> sum = (_op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, !partial_sum_bit, _op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, partial_sum_bit, !_op0[op0_idx - 1]}); + } + // 11 -> sum = times_three + prop.lcnf({!bit0, !bit1, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf({!bit0, !bit1, partial_sum_bit, !times_three()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit1, + prop.land(bit0, op0[op0_idx]), // 0x + prop.lselect( // 1x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx]))); + } +# elif RADIX_MULTIPLIER == 8 + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + + // 000 + prop.lcnf({bit0, bit1, bit2, !partial_sum_bit}); + // 001 -> sum = _op0 + prop.lcnf({!bit0, bit1, bit2, !partial_sum_bit, _op0[op0_idx]}); + prop.lcnf({!bit0, bit1, bit2, partial_sum_bit, !_op0[op0_idx]}); + // 010 -> sum = (_op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit, _op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, bit2, partial_sum_bit, !_op0[op0_idx - 1]}); + } + // 011 -> sum = times_three + prop.lcnf( + {!bit0, !bit1, bit2, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, bit2, partial_sum_bit, !times_three()[op0_idx]}); + // 100 -> sum = (_op0 << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit, _op0[op0_idx - 2]}); + prop.lcnf({bit0, bit1, !bit2, partial_sum_bit, !_op0[op0_idx - 2]}); + } + // 101 -> sum = times_five + prop.lcnf( + {!bit0, bit1, !bit2, !partial_sum_bit, times_five()[op0_idx]}); + prop.lcnf( + {!bit0, bit1, !bit2, partial_sum_bit, !times_five()[op0_idx]}); + // 110 -> sum = (times_three << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + !bit2, + !partial_sum_bit, + times_three()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + !bit2, + partial_sum_bit, + !times_three()[op0_idx - 1]}); + } + // 111 -> sum = times_seven + prop.lcnf( + {!bit0, !bit1, !bit2, !partial_sum_bit, times_seven()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, !bit2, partial_sum_bit, !times_seven()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit2, + prop.lselect( // 0* + !bit1, + prop.land(bit0, op0[op0_idx]), // 00x + prop.lselect( // 01x + !bit0, + op0_idx == 0 ? 
const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx])), + prop.lselect( // 1* + !bit1, + prop.lselect( // 10x + !bit0, + op0_idx <= 1 ? const_literal(false) : op0[op0_idx - 2], + times_five()[op0_idx]), + prop.lselect( // 11x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_three()[op0_idx - 1], + times_seven()[op0_idx])))); + } +# elif RADIX_MULTIPLIER == 16 + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + + // 0000 + prop.lcnf({bit0, bit1, bit2, bit3, !partial_sum_bit}); + // 0001 -> sum = op0 + prop.lcnf({!bit0, bit1, bit2, bit3, !partial_sum_bit, op0[op0_idx]}); + prop.lcnf({!bit0, bit1, bit2, bit3, partial_sum_bit, !op0[op0_idx]}); + // 0010 -> sum = (op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, !bit1, bit2, bit3, !partial_sum_bit, op0[op0_idx - 1]}); + prop.lcnf( + {bit0, !bit1, bit2, bit3, partial_sum_bit, !op0[op0_idx - 1]}); + } + // 0011 -> sum = times_three + prop.lcnf( + {!bit0, + !bit1, + bit2, + bit3, + !partial_sum_bit, + times_three()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + bit2, + bit3, + partial_sum_bit, + !times_three()[op0_idx]}); + // 0100 -> sum = (op0 << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, bit1, !bit2, bit3, !partial_sum_bit, op0[op0_idx - 2]}); + prop.lcnf( + {bit0, bit1, !bit2, bit3, partial_sum_bit, !op0[op0_idx - 2]}); + } + // 0101 -> sum = times_five + prop.lcnf( + {!bit0, + bit1, + !bit2, + bit3, + !partial_sum_bit, + times_five()[op0_idx]}); + prop.lcnf( + {!bit0, + bit1, + !bit2, + bit3, + partial_sum_bit, + !times_five()[op0_idx]}); + // 0110 -> sum = (times_three << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + !bit2, + bit3, + !partial_sum_bit, + times_three()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + !bit2, + bit3, + partial_sum_bit, + !times_three()[op0_idx - 1]}); + } + // 0111 -> sum = times_seven + prop.lcnf( + {!bit0, + !bit1, + !bit2, + bit3, + !partial_sum_bit, + times_seven()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + !bit2, + bit3, + partial_sum_bit, + !times_seven()[op0_idx]}); + + // 1000 -> sum = (op0 << 3) + if(op0_idx == 0 || op0_idx == 1 || op0_idx == 2) + prop.lcnf({bit0, bit1, bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, bit1, bit2, !bit3, !partial_sum_bit, op0[op0_idx - 3]}); + prop.lcnf( + {bit0, bit1, bit2, !bit3, partial_sum_bit, !op0[op0_idx - 3]}); + } + // 1001 -> sum = times_nine + prop.lcnf( + {!bit0, + bit1, + bit2, + !bit3, + !partial_sum_bit, + times_nine()[op0_idx]}); + prop.lcnf( + {!bit0, + bit1, + bit2, + !bit3, + partial_sum_bit, + !times_nine()[op0_idx]}); + // 1010 -> sum = (times_five << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + bit2, + !bit3, + !partial_sum_bit, + times_five()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + bit2, + !bit3, + partial_sum_bit, + !times_five()[op0_idx - 1]}); + } + // 1011 -> sum = times_eleven + prop.lcnf( + {!bit0, + !bit1, + bit2, + !bit3, + !partial_sum_bit, + times_eleven()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + bit2, + !bit3, + partial_sum_bit, + !times_eleven()[op0_idx]}); + // 1100 -> sum = (times_three << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, !bit3, !partial_sum_bit}); + else + { + 
prop.lcnf( + {bit0, + bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_three()[op0_idx - 2]}); + prop.lcnf( + {bit0, + bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_three()[op0_idx - 2]}); + } + // 1101 -> sum = times_thirteen + prop.lcnf( + {!bit0, + bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_thirteen()[op0_idx]}); + prop.lcnf( + {!bit0, + bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_thirteen()[op0_idx]}); + // 1110 -> sum = (times_seven << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, !bit3, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, + !bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_seven()[op0_idx - 1]}); + prop.lcnf( + {bit0, + !bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_seven()[op0_idx - 1]}); + } + // 1111 -> sum = times_fifteen + prop.lcnf( + {!bit0, + !bit1, + !bit2, + !bit3, + !partial_sum_bit, + times_fifteen()[op0_idx]}); + prop.lcnf( + {!bit0, + !bit1, + !bit2, + !bit3, + partial_sum_bit, + !times_fifteen()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit3, + prop.lselect( // 0* + !bit2, + prop.lselect( // 00* + !bit1, + prop.land(bit0, op0[op0_idx]), // 000x + prop.lselect( // 001x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx])), + prop.lselect( // 01* + !bit1, + prop.lselect( // 010x + !bit0, + op0_idx <= 1 ? const_literal(false) : op0[op0_idx - 2], + times_five()[op0_idx]), + prop.lselect( // 011x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_three()[op0_idx - 1], + times_seven()[op0_idx]))), + prop.lselect( // 1* + !bit2, + prop.lselect( // 10* + !bit1, + prop.lselect( // 100x + !bit0, + op0_idx <= 2 ? const_literal(false) : op0[op0_idx - 3], + times_nine()[op0_idx]), + prop.lselect( // 101x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_five()[op0_idx - 1], + times_eleven()[op0_idx])), + prop.lselect( // 11* + !bit1, + prop.lselect( // 110x + !bit0, + op0_idx <= 1 ? const_literal(false) + : times_three()[op0_idx - 2], + times_thirteen()[op0_idx]), + prop.lselect( // 111x + !bit0, + op0_idx == 0 ? const_literal(false) + : times_seven()[op0_idx - 1], + times_fifteen()[op0_idx]))))); + } +# else +# error Unsupported radix +# endif + } + } + + pps.push_back(std::move(partial_sum)); + } + + if(op1.size() % RADIX_GROUP_SIZE == 1) + { + if(op0.size() == op1.size()) + { + if(pps.empty()) + pps.push_back(bvt(op0.size(), const_literal(false))); + + // This is the partial product of the MSB of op1 with op0, which is all + // zeros except for (possibly) the MSB. Since we don't need to account for + // any carry out of adding this partial product, we just need to compute + // the sum the MSB of one of the partial products and this partial + // product, we is an xor of just those bits. 
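+      // In other words: XOR the MSB of the last partial sum with
+      // (op0[0] AND the MSB of op1); no other bit of the result and no
+      // carry can be affected by this final partial product.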
+ pps.back().back() = + prop.lxor(pps.back().back(), prop.land(op0[0], op1.back())); + } + else + { + bvt partial_sum = bvt(op1.size() - 1, const_literal(false)); + for(const auto &lit : op0) + { + partial_sum.push_back(prop.land(lit, op1.back())); + if(partial_sum.size() == op0.size()) + break; + } + pps.push_back(std::move(partial_sum)); + } + } +# if RADIX_MULTIPLIER >= 8 + else if(op1.size() % RADIX_GROUP_SIZE == 2) + { + const literalt &bit0 = op1[op1.size() - 2]; + const literalt &bit1 = op1[op1.size() - 1]; + + bvt partial_sum = bvt(op1.size() - 2, const_literal(false)); + for(std::size_t op0_idx = 0; op0_idx < 2; ++op0_idx) + { + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + // 00 + prop.lcnf({bit0, bit1, !partial_sum_bit}); + // 01 -> sum = op0 + prop.lcnf({!bit0, bit1, !partial_sum_bit, op0[op0_idx]}); + prop.lcnf({!bit0, bit1, partial_sum_bit, !op0[op0_idx]}); + // 10 -> sum = (op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, !partial_sum_bit, op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, partial_sum_bit, !op0[op0_idx - 1]}); + } + // 11 -> sum = times_three + prop.lcnf({!bit0, !bit1, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf({!bit0, !bit1, partial_sum_bit, !times_three()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit1, + prop.land(bit0, op0[op0_idx]), // 0x + prop.lselect( // 1x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx]))); + } + } + + pps.push_back(std::move(partial_sum)); + } +# endif +# if RADIX_MULTIPLIER == 16 + else if(op1.size() % RADIX_GROUP_SIZE == 3) + { + const literalt &bit0 = op1[op1.size() - 3]; + const literalt &bit1 = op1[op1.size() - 2]; + const literalt &bit2 = op1[op1.size() - 1]; + + bvt partial_sum = bvt(op1.size() - 3, const_literal(false)); + for(std::size_t op0_idx = 0; op0_idx < 3; ++op0_idx) + { + if(prop.cnf_handled_well()) + { + literalt partial_sum_bit = prop.new_variable(); + partial_sum.push_back(partial_sum_bit); + // 000 + prop.lcnf({bit0, bit1, bit2, !partial_sum_bit}); + // 001 -> sum = op0 + prop.lcnf({!bit0, bit1, bit2, !partial_sum_bit, op0[op0_idx]}); + prop.lcnf({!bit0, bit1, bit2, partial_sum_bit, !op0[op0_idx]}); + // 010 -> sum = (op0 << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, !bit1, bit2, !partial_sum_bit, op0[op0_idx - 1]}); + prop.lcnf({bit0, !bit1, bit2, partial_sum_bit, !op0[op0_idx - 1]}); + } + // 011 -> sum = times_three + prop.lcnf( + {!bit0, !bit1, bit2, !partial_sum_bit, times_three()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, bit2, partial_sum_bit, !times_three()[op0_idx]}); + // 100 -> sum = (op0 << 2) + if(op0_idx == 0 || op0_idx == 1) + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf({bit0, bit1, !bit2, !partial_sum_bit, op0[op0_idx - 2]}); + prop.lcnf({bit0, bit1, !bit2, partial_sum_bit, !op0[op0_idx - 2]}); + } + // 101 -> sum = times_five + prop.lcnf( + {!bit0, bit1, !bit2, !partial_sum_bit, times_five()[op0_idx]}); + prop.lcnf( + {!bit0, bit1, !bit2, partial_sum_bit, !times_five()[op0_idx]}); + // 110 -> sum = (times_three << 1) + if(op0_idx == 0) + prop.lcnf({bit0, !bit1, !bit2, !partial_sum_bit}); + else + { + prop.lcnf( + {bit0, !bit1, !bit2, !partial_sum_bit, times_three()[op0_idx - 1]}); + prop.lcnf( + {bit0, !bit1, !bit2, partial_sum_bit, !times_three()[op0_idx - 1]}); + } + // 111 -> sum = 
times_seven + prop.lcnf( + {!bit0, !bit1, !bit2, !partial_sum_bit, times_seven()[op0_idx]}); + prop.lcnf( + {!bit0, !bit1, !bit2, partial_sum_bit, !times_seven()[op0_idx]}); + } + else + { + partial_sum.push_back(prop.lselect( + !bit2, + prop.lselect( // 0* + !bit1, + prop.land(bit0, op0[op0_idx]), // 00x + prop.lselect( // 01x + !bit0, + op0_idx == 0 ? const_literal(false) : op0[op0_idx - 1], + times_three()[op0_idx])), + prop.lselect( // 1* + !bit1, + prop.lselect( // 10x + !bit0, + op0_idx <= 1 ? const_literal(false) : op0[op0_idx - 2], + times_five()[op0_idx]), + prop.lselect( // 11x + !bit0, + op0_idx == 0 ? const_literal(false) : times_three()[op0_idx - 1], + times_seven()[op0_idx])))); + } + } + + pps.push_back(std::move(partial_sum)); + } +# endif +#endif if(pps.empty()) return zeros(op0.size()); From 12a37e7c64cdc8b8e75d8c9965ea3229e64f4fce Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Tue, 25 Jan 2022 14:17:22 +0000 Subject: [PATCH 05/13] Implement Karatsuba's multiplication algorithm Uses extra sign-bit to keep bit widths small. --- src/solvers/flattening/bv_utils.cpp | 131 +++++++++++++++++++++++++++- src/solvers/flattening/bv_utils.h | 2 + 2 files changed, 132 insertions(+), 1 deletion(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 66801a0cfc9..21bcc2aaa65 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -926,7 +926,8 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // with Dadda's reduction yields the most consistent performance improvement // while not regressing substantially in the matrix of different benchmarks and // CaDiCaL and MiniSat2 as solvers. -#define RADIX_MULTIPLIER 8 +// #define RADIX_MULTIPLIER 8 +#define USE_KARATSUBA #ifdef RADIX_MULTIPLIER # define DADDA_TREE #endif @@ -1829,6 +1830,125 @@ bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1) } } +bvt bv_utilst::unsigned_karatsuba_full_multiplier( + const bvt &op0, + const bvt &op1) +{ + // We review symbolic encoding of multiplication in context of sw + // verification, bit width is 2^n, distinguish truncating (x mod 2^2^n) from + // double-output-width multiplication, truncating Karatsuba is 2 truncating + // half-width multiplication plus one double-output-width of half width, for + // double output width Karatsuba idea is challenge to avoid width extension, + // check Wikipedia edit history + + PRECONDITION(op0.size() == op1.size()); + const std::size_t op_size = op0.size(); + PRECONDITION(op_size > 0); + PRECONDITION((op_size & (op_size - 1)) == 0); + + if(op_size == 1) + return {prop.land(op0[0], op1[0]), const_literal(false)}; + + const std::size_t half_op_size = op_size >> 1; + + bvt x0{op0.begin(), op0.begin() + half_op_size}; + bvt x1{op0.begin() + half_op_size, op0.end()}; + + bvt y0{op1.begin(), op1.begin() + half_op_size}; + bvt y1{op1.begin() + half_op_size, op1.end()}; + + bvt z0 = unsigned_karatsuba_full_multiplier(x0, y0); + bvt z2 = unsigned_karatsuba_full_multiplier(x1, y1); + + bvt x0_sub = zero_extension(x0, half_op_size + 1); + bvt x1_sub = zero_extension(x1, half_op_size + 1); + + bvt y0_sub = zero_extension(y0, half_op_size + 1); + bvt y1_sub = zero_extension(y1, half_op_size + 1); + + bvt x1_minus_x0_ext = sub(x1_sub, x0_sub); + literalt x1_minus_x0_sign = sign_bit(x1_minus_x0_ext); + bvt x1_minus_x0_abs = absolute_value(x1_minus_x0_ext); + x1_minus_x0_abs.pop_back(); + bvt y0_minus_y1_ext = sub(y0_sub, y1_sub); + literalt y0_minus_y1_sign = 
sign_bit(y0_minus_y1_ext); + bvt y0_minus_y1_abs = absolute_value(y0_minus_y1_ext); + y0_minus_y1_abs.pop_back(); + bvt sub_mult = + unsigned_karatsuba_full_multiplier(x1_minus_x0_abs, y0_minus_y1_abs); + bvt sub_mult_ext = zero_extension(sub_mult, op_size + 1); + bvt z1_ext = add_sub( + add(zero_extension(z0, op_size + 1), zero_extension(z2, op_size + 1)), + sub_mult_ext, + prop.lxor(x1_minus_x0_sign, y0_minus_y1_sign)); + + bvt z0_full = zero_extension(z0, op_size << 1); + bvt z1_full = + zero_extension(concatenate(zeros(half_op_size), z1_ext), op_size << 1); + bvt z2_full = concatenate(zeros(op_size), z2); + + return add(add(z0_full, z1_full), z2_full); +} + +bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1) +{ + if(_op0.size() != _op1.size()) + return unsigned_multiplier(_op0, _op1); + + const std::size_t op_size = _op0.size(); + // only use this approach for powers of two + if(op_size == 0 || (op_size & (op_size - 1)) != 0) + return unsigned_multiplier(_op0, _op1); + + if(op_size == 1) + return {prop.land(_op0[0], _op1[0])}; + + const std::size_t half_op_size = op_size >> 1; + + // We split each of the operands in half and treat them as coefficients of a + // polynomial a * 2^half_op_size + b. Straightforward polynomial + // multiplication then yields + // a0 * a1 * 2^op_size + (a0 * b1 + a1 * b0) * 2^half_op_size + b0 * b1 + // These would be four multiplications (the operands of which have half the + // original bit width): + // z0 = b0 * b1 + // z1 = a0 * b1 + a1 * b0 + // z2 = a0 * a1 + // Karatsuba's insight is that these four multiplications can be expressed + // using just three multiplications: + // z1 = (a0 - b0) * (b1 - a1) + z0 + z2 + // + // Worked 4-bit example, 4-bit result: + // abcd * efgh -> 4-bit result + // cd * gh -> 4-bit result + // cd * ef -> 2-bit result + // ab * gh -> 2-bit result + // d * h -> 2-bit result + // c * g -> 2-bit result + // (c - d) * (h - g) + dh + cg; use an extra sign bit for each of the + // subtractions, and conditionally negate the product by xor-ing those sign + // bits; dh + cg is a 2-bit addition (with possible results 0, 1, 2); the + // product has possible values (-1, 0, 1); the final sum cannot evaluate to -1 + // as + // * c=1, d=0, h=0, g=1 (1 * -1) implies cg=1 + // * c=0, d=1, h=1, g=0 (-1 * 1) implies dh=1 + // Therefore, after adding (dh + cg) the multiplication can safely be added + // over just 2 bits. 
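+  // A worked numeric check of the three-multiplication identity (using the
+  // 8-bit operands 213 and 15 as an example):
+  //   x1 = 13, x0 = 5, y1 = 0, y0 = 15
+  //   z0 = x0 * y0 = 75, z2 = x1 * y1 = 0
+  //   z1 = (x1 - x0) * (y0 - y1) + z0 + z2 = 8 * 15 + 75 + 0 = 195
+  //   (z0 + (z1 << 4)) mod 2^8 = 75 + 48 = 123 = (213 * 15) mod 2^8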
+ + bvt x0{_op0.begin(), _op0.begin() + half_op_size}; + bvt x1{_op0.begin() + half_op_size, _op0.end()}; + bvt y0{_op1.begin(), _op1.begin() + half_op_size}; + bvt y1{_op1.begin() + half_op_size, _op1.end()}; + + bvt z0 = unsigned_karatsuba_full_multiplier(x0, y0); + bvt z1 = add( + unsigned_karatsuba_multiplier(x1, y0), + unsigned_karatsuba_multiplier(x0, y1)); + bvt z1_full = concatenate(zeros(half_op_size), z1); + + return add(z0, z1_full); +} + bvt bv_utilst::unsigned_multiplier_no_overflow( const bvt &op0, const bvt &op1) @@ -1879,7 +1999,11 @@ bvt bv_utilst::signed_multiplier(const bvt &op0, const bvt &op1) bvt neg0=cond_negate(op0, sign0); bvt neg1=cond_negate(op1, sign1); +#ifdef USE_KARATSUBA + bvt result = unsigned_karatsuba_multiplier(neg0, neg1); +#else bvt result=unsigned_multiplier(neg0, neg1); +#endif literalt result_sign=prop.lxor(sign0, sign1); @@ -1947,7 +2071,12 @@ bvt bv_utilst::multiplier( switch(rep) { case representationt::SIGNED: return signed_multiplier(op0, op1); +#ifdef USE_KARATSUBA + case representationt::UNSIGNED: + return unsigned_karatsuba_multiplier(op0, op1); +#else case representationt::UNSIGNED: return unsigned_multiplier(op0, op1); +#endif } UNREACHABLE; diff --git a/src/solvers/flattening/bv_utils.h b/src/solvers/flattening/bv_utils.h index 60ba1422b56..fe81a4b1d80 100644 --- a/src/solvers/flattening/bv_utils.h +++ b/src/solvers/flattening/bv_utils.h @@ -79,6 +79,8 @@ class bv_utilst bvt shift(const bvt &op, const shiftt shift, const bvt &distance); bvt unsigned_multiplier(const bvt &op0, const bvt &op1); + bvt unsigned_karatsuba_multiplier(const bvt &op0, const bvt &op1); + bvt unsigned_karatsuba_full_multiplier(const bvt &op0, const bvt &op1); bvt signed_multiplier(const bvt &op0, const bvt &op1); bvt multiplier(const bvt &op0, const bvt &op1, representationt rep); bvt multiplier_no_overflow( From 6a3c7d3cf4e62121e110be431179e41daef182f3 Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Wed, 15 Nov 2023 23:06:08 +0000 Subject: [PATCH 06/13] Symbolic Toom-Cook multiplication Implements the algorithm of Section 4 of "Further Steps Down The Wrong Path : Improving the Bit-Blasting of Multiplication" (see https://ceur-ws.org/Vol-2908/short16.pdf). --- src/solvers/flattening/bv_utils.cpp | 159 +++++++++++++++++++++++++++- src/solvers/flattening/bv_utils.h | 1 + 2 files changed, 158 insertions(+), 2 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 21bcc2aaa65..9b0bb2c391b 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -784,7 +784,7 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // trees (and also the default addition scheme), but isn't consistently more // performant with simple partial-product generation. Only when using // higher-radix multipliers the combination appears to perform better. -#define DADDA_TREE +// #define DADDA_TREE // The following examples demonstrate the performance differences (with a // time-out of 7200 seconds): @@ -927,7 +927,8 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // while not regressing substantially in the matrix of different benchmarks and // CaDiCaL and MiniSat2 as solvers. 
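+// Toom-Cook (see unsigned_toom_cook_multiplier below) takes a different
+// route: the operands are split into GROUP_SIZE-bit coefficients of a
+// polynomial, fresh variables stand for the coefficients of the product
+// polynomial, and these are constrained by evaluating both sides at the
+// points 0 and +/-2^(j-1). With two groups a1*x + a0 and b1*x + b0, for
+// example, the three constraints are
+//   d0 = a0 * b0
+//   d2 + d1 + d0 = (a1 + a0) * (b1 + b0)
+//   d2 - d1 + d0 = (a1 - a0) * (b1 - b0)
+// from which the product d2*x^2 + d1*x + d0 is assembled by shifted adds.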
// #define RADIX_MULTIPLIER 8 -#define USE_KARATSUBA +// #define USE_KARATSUBA +#define USE_TOOM_COOK #ifdef RADIX_MULTIPLIER # define DADDA_TREE #endif @@ -1949,6 +1950,155 @@ bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1) return add(z0, z1_full); } +bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) +{ + PRECONDITION(!_op0.empty()); + PRECONDITION(!_op1.empty()); + + if(_op1.size() == 1) + return unsigned_multiplier(_op0, _op1); + + // break up _op0, _op1 in groups of at most GROUP_SIZE bits + PRECONDITION(_op0.size() == _op1.size()); +#define GROUP_SIZE 8 + const std::size_t d_bits = + 2 * GROUP_SIZE + + 2 * address_bits((_op0.size() + GROUP_SIZE - 1) / GROUP_SIZE); + std::vector a, b, c_ops, d; + for(std::size_t i = 0; i < _op0.size(); i += GROUP_SIZE) + { + std::size_t u = std::min(i + GROUP_SIZE, _op0.size()); + a.emplace_back(_op0.begin() + i, _op0.begin() + u); + b.emplace_back(_op1.begin() + i, _op1.begin() + u); + + c_ops.push_back(zeros(i)); + d.push_back(prop.new_variables(d_bits)); + c_ops.back().insert(c_ops.back().end(), d.back().begin(), d.back().end()); + c_ops.back() = zero_extension(c_ops.back(), _op0.size()); + } + for(std::size_t i = a.size(); i < 2 * a.size() - 1; ++i) + { + d.push_back(prop.new_variables(d_bits)); + } + + // r(0) + bvt r_0 = d[0]; + prop.l_set_to_true(equal( + r_0, + unsigned_multiplier( + zero_extension(a[0], r_0.size()), zero_extension(b[0], r_0.size())))); + + for(std::size_t j = 1; j < a.size(); ++j) + { + // r(2^(j-1)) + bvt r_j = zero_extension( + d[0], std::min(_op0.size(), d[0].size() + (j - 1) * (d.size() - 1))); + for(std::size_t i = 1; i < d.size(); ++i) + { + r_j = add( + r_j, + shift( + zero_extension(d[i], r_j.size()), shiftt::SHIFT_LEFT, (j - 1) * i)); + } + + bvt a_even = zero_extension(a[0], r_j.size()); + for(std::size_t i = 2; i < a.size(); i += 2) + { + a_even = add( + a_even, + shift( + zero_extension(a[i], a_even.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + bvt a_odd = zero_extension(a[1], r_j.size()); + for(std::size_t i = 3; i < a.size(); i += 2) + { + a_odd = add( + a_odd, + shift( + zero_extension(a[i], a_odd.size()), + shiftt::SHIFT_LEFT, + (j - 1) * (i - 1))); + } + bvt b_even = zero_extension(b[0], r_j.size()); + for(std::size_t i = 2; i < b.size(); i += 2) + { + b_even = add( + b_even, + shift( + zero_extension(b[i], b_even.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + bvt b_odd = zero_extension(b[1], r_j.size()); + for(std::size_t i = 3; i < b.size(); i += 2) + { + b_odd = add( + b_odd, + shift( + zero_extension(b[i], b_odd.size()), + shiftt::SHIFT_LEFT, + (j - 1) * (i - 1))); + } + + prop.l_set_to_true(equal( + r_j, + unsigned_multiplier( + add(a_even, shift(a_odd, shiftt::SHIFT_LEFT, j - 1)), + add(b_even, shift(b_odd, shiftt::SHIFT_LEFT, j - 1))))); + + // r(-2^(j-1)) + bvt r_minus_j = zero_extension( + d[0], std::min(_op0.size(), d[0].size() + (j - 1) * (d.size() - 1))); + for(std::size_t i = 1; i < d.size(); ++i) + { + if(i % 2 == 1) + { + r_minus_j = sub( + r_minus_j, + shift( + zero_extension(d[i], r_minus_j.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + else + { + r_minus_j = add( + r_minus_j, + shift( + zero_extension(d[i], r_minus_j.size()), + shiftt::SHIFT_LEFT, + (j - 1) * i)); + } + } + + prop.l_set_to_true(equal( + r_minus_j, + unsigned_multiplier( + sub(a_even, shift(a_odd, shiftt::SHIFT_LEFT, j - 1)), + sub(b_even, shift(b_odd, shiftt::SHIFT_LEFT, j - 1))))); + } + + if(c_ops.empty()) + return zeros(_op0.size()); + else + 
{ +#ifdef WALLACE_TREE + return wallace_tree(c_ops); +#elif defined(DADDA_TREE) + return dadda_tree(c_ops); +#else + bvt product = c_ops.front(); + + for(auto it = std::next(c_ops.begin()); it != c_ops.end(); ++it) + product = add(product, *it); + + return product; +#endif + } +} + bvt bv_utilst::unsigned_multiplier_no_overflow( const bvt &op0, const bvt &op1) @@ -2001,6 +2151,8 @@ bvt bv_utilst::signed_multiplier(const bvt &op0, const bvt &op1) #ifdef USE_KARATSUBA bvt result = unsigned_karatsuba_multiplier(neg0, neg1); +#elif defined(USE_TOOM_COOK) + bvt result = unsigned_toom_cook_multiplier(neg0, neg1); #else bvt result=unsigned_multiplier(neg0, neg1); #endif @@ -2074,6 +2226,9 @@ bvt bv_utilst::multiplier( #ifdef USE_KARATSUBA case representationt::UNSIGNED: return unsigned_karatsuba_multiplier(op0, op1); +#elif defined(USE_TOOM_COOK) + case representationt::UNSIGNED: + return unsigned_toom_cook_multiplier(op0, op1); #else case representationt::UNSIGNED: return unsigned_multiplier(op0, op1); #endif diff --git a/src/solvers/flattening/bv_utils.h b/src/solvers/flattening/bv_utils.h index fe81a4b1d80..ac376f7e1d7 100644 --- a/src/solvers/flattening/bv_utils.h +++ b/src/solvers/flattening/bv_utils.h @@ -81,6 +81,7 @@ class bv_utilst bvt unsigned_multiplier(const bvt &op0, const bvt &op1); bvt unsigned_karatsuba_multiplier(const bvt &op0, const bvt &op1); bvt unsigned_karatsuba_full_multiplier(const bvt &op0, const bvt &op1); + bvt unsigned_toom_cook_multiplier(const bvt &op0, const bvt &op1); bvt signed_multiplier(const bvt &op0, const bvt &op1); bvt multiplier(const bvt &op0, const bvt &op1, representationt rep); bvt multiplier_no_overflow( From ab97a3676f7d0e00e5325ee2222a16c97d82a24a Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Mon, 14 Jul 2025 10:38:54 +0000 Subject: [PATCH 07/13] bvt debug output helper Prints a vector of literals in human-readable form, including a decimal representation when all literals are constants. --- src/solvers/flattening/bv_utils.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 9b0bb2c391b..ca5a6495055 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -13,6 +13,33 @@ Author: Daniel Kroening, kroening@kroening.com #include #include +static std::string beautify(const bvt &bv) +{ + for(const auto &v : bv) + { + if(!v.is_constant()) + { + std::ostringstream oss; + oss << bv; + return oss.str(); + } + } + + std::string result; + std::size_t number = 0; + for(std::size_t i = 0; i < bv.size(); ++i) + { + if(result.size() % 5 == 4) + result = std::string(" ") + result; + result = std::string(bv[i].is_false() ? "0" : "1") + result; + + if(bv[i].is_true()) + number += 1 << i; + } + + return result + " (" + std::to_string(number) + ")"; +} + bvt bv_utilst::build_constant(const mp_integer &n, std::size_t width) { std::string n_str=integer2binary(n, width); From dfebcc611aa7f16dff0c5a16250a953d5cb269ff Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Mon, 11 Mar 2024 16:14:06 +0000 Subject: [PATCH 08/13] =?UTF-8?q?Sch=C3=B6nhage-Strassen=20multiplication?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a symbolic version of the algorithm proposed by Schönhage and Strassen in "Schnelle Multiplikation großer Zahlen", Computing, 7, 1971. 
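The encoding works modulo the Fermat number F_n = 2^2^n + 1 and relies on 2
(respectively 4, for even m) being a root of unity of the right order modulo
F_n. That fact is easy to sanity-check concretely; a minimal sketch for the
8-bit worked example used in the code (n = 3, F_3 = 257, omega = 2):

    #include <cassert>
    #include <cstdint>

    int main()
    {
      // 2^8 = 256 \equiv -1 (mod 257), hence 2^16 \equiv 1 (mod 257),
      // so 2 is a 2^(n + 1)-th = 16th root of unity modulo F_3.
      uint32_t x = 1;
      for(int i = 0; i < 16; ++i)
        x = (2 * x) % 257;
      assert(x == 1);
    }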
--- src/solvers/flattening/bv_utils.cpp | 386 +++++++++++++++++++++++++++- src/solvers/flattening/bv_utils.h | 1 + 2 files changed, 386 insertions(+), 1 deletion(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index ca5a6495055..41850dc0b1a 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -10,6 +10,8 @@ Author: Daniel Kroening, kroening@kroening.com #include +#include +#include #include #include @@ -955,7 +957,8 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // CaDiCaL and MiniSat2 as solvers. // #define RADIX_MULTIPLIER 8 // #define USE_KARATSUBA -#define USE_TOOM_COOK +// #define USE_TOOM_COOK +#define USE_SCHOENHAGE_STRASSEN #ifdef RADIX_MULTIPLIER # define DADDA_TREE #endif @@ -2126,6 +2129,382 @@ bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) } } +bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( + const bvt &a, + const bvt &b) +{ + PRECONDITION(a.size() == b.size()); + + // Running examples: we want to multiple 213 by 15 as 8- or 9-bit integers. + // That is, we seek to multiply 11010101 (011010101) by 00001111 (000001111). + // ^bit 7 ^bit 0 + // The expected result is 123 as both an 8-bit and 9-bit result (001111011). + + // We compute the result modulo a Fermat number F_m = 2^2^m + 1. The maximum + // result when multiplying a by b (with their sizes being the same per the + // precondition above) is 2^2*op_size - 1. + // TODO: we don't actually need a full multiplier, a result with up to op_size + // bits is sufficient for our purposes. + // Hence we require 2^2^m >= 2^2*op_size, i.e., 2^m >= 2*op_size, or + // m >= log_2(op_size) + 1. + // For our examples m will be 4 and 5, respectively, with Fermat numbers + // 2^16 + 1 and 2^32 + 1. + const std::size_t m = address_bits(a.size()) + 1; + std::cerr << "m: " << m << std::endl; + + // Extend bit width to 2^(m + 1) = op_size (rounded to next power of 2) * 4 + // For our examples, extended bit widths will be 32 and 64. + PRECONDITION(sizeof(std::size_t) * CHAR_BIT > m + 1); + const std::size_t two_to_m_plus_1 = (std::size_t)1 << (m + 1); + std::cerr << "a: " << beautify(a) << std::endl; + std::cerr << "b: " << beautify(b) << std::endl; + bvt a_ext = zero_extension(a, two_to_m_plus_1); + bvt b_ext = zero_extension(b, two_to_m_plus_1); + + // We need to distinguish whether m is even or odd + // m = 2n - 1 for odd m and m = 2n -2 for even m + // For our 8-bit inputs we have m = 4 and, therefore, n = 3. + // For our 9-bit inputs we have m = 5 and, therefore, n = 3. + const std::size_t n = m % 2 == 1 ? (m + 1) / 2 : m / 2 + 1; + std::cerr << "n: " << n << std::endl; + + // For even m create 2^n (of 2^(n - 1) bits) chunks from a_ext, b_ext (for our + // 8-bit inputs we have chunk_size = 4 with num_chunks = 8). + // For odd m create 2^(n + 1) chunks (of 2^(n - 1) bits) from a_ext, b_ext; + // a_0 will be bit positions 0 through to 2^(n - 1) - 1, a_{2^(n + 1) - 1} + // will be bit positions up to 2^(m + 1) - 1. + // For our 9-bit inputs we have chunk_size = 4 with num_chunks = 16 + const std::size_t chunk_size = (std::size_t)1 << (n - 1); + const std::size_t num_chunks = two_to_m_plus_1 / chunk_size; + CHECK_RETURN( + num_chunks == m % 2 ? 
(std::size_t)1 << (n + 1) : (std::size_t)1 << n); + std::cerr << "chunk_size: " << chunk_size << std::endl; + std::cerr << "num_chunks: " << num_chunks << std::endl; + std::cerr << "address_bits(num_chunks): " << address_bits(num_chunks) + << std::endl; + + std::vector a_rho, b_sigma; + a_rho.reserve(num_chunks); + b_sigma.reserve(num_chunks); + for(std::size_t i = 0; i < num_chunks; ++i) + { + a_rho.emplace_back( + a_ext.begin() + i * chunk_size, a_ext.begin() + (i + 1) * chunk_size); + b_sigma.emplace_back( + b_ext.begin() + i * chunk_size, b_ext.begin() + (i + 1) * chunk_size); + } + // For our example we now have + // a_rho = [ 0101, 1101, 0000, ..., 0000 ] + // b_sigma = [ 1111, 0000, 0000, ..., 0000 ] + + // Compute gamma_r = \sum_{i + j = r} a_i * b_j with bit width 3n + 5 with r + // ranging from 0 to 2^(n + 2) - 1 (to 2^(n + 1) - 1 when m is even). + // For our example this will be additions/multiplications of width 14 + // (implying that school book multiplication would be cheaper, as is the case + // for all operand lengths below 32 bits). + // TODO: all subsequent steps seem to be using mod 2^(n + 2) (mod 2^(n + 1) + // when m is even), so it may be sufficient to do this over n + 2 bits instead + // of 3n + 5. + std::vector gamma_tau{num_chunks * 2, zeros(3 * n + 5)}; + for(std::size_t tau = 0; tau < num_chunks * 2; ++tau) + { + for(std::size_t rho = tau < num_chunks ? 0 : tau - num_chunks + 1; + rho < num_chunks && rho <= tau; + ++rho) + { + const std::size_t sigma = tau - rho; + gamma_tau[tau] = add( + gamma_tau[tau], + unsigned_multiplier( + zero_extension(a_rho[rho], 3 * n + 5), + zero_extension(b_sigma[sigma], 3 * n + 5))); + } + } + // For our example we obtain + // gamma_tau = [ 00 0000 0100 1011, 00 0000 1100 0011, 0.... ] + + // Compute c_tau over bit width n + 2 (n + 1 when m is even) as gamma_tau + + // gamma_{tau + 2^(n + 1)} (gamma_{tau + 2^n} when m is even). + std::vector c_tau; + c_tau.reserve(num_chunks); + for(std::size_t tau = 0; tau < num_chunks; ++tau) + { + c_tau.push_back(add(gamma_tau[tau], gamma_tau[tau + num_chunks])); + CHECK_RETURN(c_tau.back().size() >= address_bits(num_chunks) + 1); + c_tau.back().resize(address_bits(num_chunks) + 1); + std::cerr << "c_tau[" << tau << "]: " << beautify(c_tau[tau]) << std::endl; + } + // For our example we obtain + // c_tau = [ 01011, 00011, 0... ] + + // Compute z_j = c_j - c_{j + 2^n} (mod 2^(n + 2)) (mod 2^(n + 1) and c_{j + + // 2^(n - 1)} when m is even) + std::vector z_j; + z_j.reserve(num_chunks / 2); + for(std::size_t j = 0; j < num_chunks / 2; ++j) + z_j.push_back(sub(c_tau[j], c_tau[j + num_chunks / 2])); + // For our example we have z_j = c_tau as all elements beyond the second one + // are zeros. + + // Compute z_j mod F_n using number-theoretic transform with omega = 2 for + // odd m and omega = 4 for even m. + // For our examples we have F_n = 2^2^n + 1 = 257 with 2 being a 2^(n + 1)-th + // root of unity, i.e., 2^16 \equiv 1 (mod 257) (with 4 being a 2^n-root of + // unity, i.e., 4^8 \equiv 1 (mod 257). 
The DFT table for omega = 2 would be + // 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + // 1 2 4 8 16 32 64 128 -1 -2 -4 -8 -16 -32 -64 -128 + // 1 4 16 64 -1 -4 -16 -64 1 4 16 64 -1 -4 -16 -64 + // 1 8 64 -2 -16 -128 4 32 -1 -8 -64 2 16 128 -4 -32 + // 1 16 -1 -16 1 16 -1 -16 1 16 -1 -16 1 16 -1 -16 + // 1 32 -4 -128 16 -2 -64 8 -1 -32 4 128 -16 2 64 -8 + // 1 64 -16 4 -1 -64 16 -2 1 64 -16 4 -1 -64 16 -4 + // 1 128 -64 32 -16 8 -2 2 -1 -128 64 -32 16 -8 4 -2 + // 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 + // 1 -2 4 -8 16 -32 64 -128 -1 2 -4 8 -16 32 -64 128 + // 1 -4 16 -64 -1 4 -16 64 1 -4 16 -64 -1 4 -16 64 + // 1 -8 64 2 -16 128 4 -32 -1 8 -64 -2 16 -128 -4 32 + // 1 -16 -1 16 1 -16 -1 16 1 -16 -1 16 1 -16 -1 16 + // 1 -32 -4 128 16 2 -64 -8 -1 32 4 -128 -16 -2 64 8 + // 1 -64 -16 -4 -1 64 16 4 1 -64 -16 -4 -1 64 16 4 + // 1 -128 -64 -32 -16 -8 -4 -2 -1 128 64 32 16 8 4 2 + // For fast NTT (less than O(n^2)) use Cooley-Tukey for NTT, then perform + // element-wise multiplication, and finally apply Gentleman-Sande for + // inverse NTT. + + // Addition mod F_n with overflow + auto cyclic_add = [this](const bvt &x, const bvt &y) + { + PRECONDITION(x.size() == y.size()); + + auto result_with_overflow = adder(x, y, const_literal(false)); + if(result_with_overflow.second.is_false()) + return result_with_overflow.first; + + return add( + result_with_overflow.first, + zero_extension(bvt{1, result_with_overflow.second}, x.size())); + }; + + // Compute NTT + std::vector a_j, b_j; + a_j.reserve(num_chunks); + b_j.reserve(num_chunks); + for(std::size_t j = 0; j < num_chunks; ++j) + { + // All NTT steps are mod F_n, i.e., mod 2^2^n + 1, which implies we need + // 2^(n + 1) bits to represent numbers + a_j.push_back(zero_extension(a_rho[j], (std::size_t)1 << (n + 1))); + b_j.push_back(zero_extension(b_sigma[j], (std::size_t)1 << (n + 1))); + } + // Use in-place iterative Cooley-Tukey + std::vector Aa, Ab; + Aa.reserve(num_chunks); + Ab.reserve(num_chunks); + // In the following we use k represented as bits k_{n - 1}...k_0 and + // j_0...j_{n - 1}, i.e., the most-significant bit of k is k_{n - 1} while the + // MSB for j is j_0. + for(std::size_t k = 0; k < num_chunks; ++k) + { + // reverse n (n - 1 if m is even) bits of k + std::size_t j = 0; + for(std::size_t nu = 0; nu < address_bits(num_chunks); ++nu) + { + j <<= 1; // the initial shift has no effect + j |= (k & (1 << nu)) >> nu; + } + Aa.push_back(a_j[j]); + Ab.push_back(b_j[j]); + } + for(std::size_t nu = 1; nu <= address_bits(num_chunks); ++nu) + { + const std::size_t bit_nu = (std::size_t)1 << (nu - 1); + std::size_t bits_up_to_nu = 0; + for(std::size_t i = 0; i < nu - 1; ++i) + bits_up_to_nu |= 1 << i; + + // we only need odd ones + for(std::size_t k = 1; k < num_chunks; k += 2) + { + if((k & bit_nu) == 0) + continue; + + bvt Aa_nu_bit_is_zero = Aa[k & ~bit_nu]; + bvt Ab_nu_bit_is_zero = Ab[k & ~bit_nu]; + + const std::size_t chi = (k & bits_up_to_nu) + << (address_bits(num_chunks) - 1 - (nu - 1)); + const std::size_t omega = m % 2 == 1 ? 
2 : 4; + const std::size_t shift_dist = chi * omega / 2; + + if(nu > 1) // no need to update even indices + { + Aa[k & ~bit_nu] = cyclic_add( + Aa_nu_bit_is_zero, shift(Aa[k], shiftt::ROTATE_LEFT, shift_dist)); + Ab[k & ~bit_nu] = cyclic_add( + Ab_nu_bit_is_zero, shift(Ab[k], shiftt::ROTATE_LEFT, shift_dist)); + std::cerr << "Aa[" << nu << "](" << (k & ~bit_nu) + << "): " << beautify(Aa[k & ~bit_nu]) << std::endl; +#if 0 + std::cerr << "Ab[" << nu << "](" << (k & ~bit_nu) + << "): " << beautify(Ab[k & ~bit_nu]) << std::endl; +#endif + } + + // subtraction mod F_n is addition of subtrahend cyclically shifted 2^n + // positions to the left + const std::size_t shift_dist_for_sub = shift_dist + ((std::size_t)1 << n); + Aa[k] = cyclic_add( + Aa_nu_bit_is_zero, + shift(Aa[k], shiftt::ROTATE_LEFT, shift_dist_for_sub)); + Ab[k] = cyclic_add( + Ab_nu_bit_is_zero, + shift(Ab[k], shiftt::ROTATE_LEFT, shift_dist_for_sub)); + std::cerr << "Aa[" << nu << "](" << k << "): " << beautify(Aa[k]) + << std::endl; +#if 0 + std::cerr << "Ab[" << nu << "](" << k << "): " << beautify(Ab[k]) + << std::endl; +#endif + } + } + + // Either compute u - v (if u > v), else u - v + 2^2^n + 1 + auto reduce_to_mod_F_n = [this](const bvt &x) + { + const std::size_t two_to_power_of_n = x.size() / 2; + // std::cerr << "two_to_power_of_n: " << two_to_power_of_n << std::endl; + const bvt u = + zero_extension(bvt{x.begin(), x.begin() + two_to_power_of_n}, x.size()); + // std::cerr << "u: " << beautify(u) << std::endl; + const bvt v = + zero_extension(bvt{x.begin() + two_to_power_of_n, x.end()}, x.size()); + // std::cerr << "v: " << beautify(v) << std::endl; + bvt two_to_power_of_two_to_power_of_n_plus_1 = build_constant(1, x.size()); + two_to_power_of_two_to_power_of_n_plus_1[two_to_power_of_n] = + const_literal(true); + const bvt u_ext = select( + unsigned_less_than(u, v), + add(u, two_to_power_of_two_to_power_of_n_plus_1), + u); + // std::cerr << "u_ext: " << beautify(u_ext) << std::endl; + return sub(u_ext, v); + }; + + std::vector a_hat_k{num_chunks, bvt{}}, b_hat_k{num_chunks, bvt{}}; + // Reduce by F_n + for(std::size_t j = 1; j < num_chunks; j += 2) + { + a_hat_k[j] = reduce_to_mod_F_n(Aa[j]); + std::cerr << "a_hat_k[" << j << "]: " << beautify(a_hat_k[j]) << std::endl; + b_hat_k[j] = reduce_to_mod_F_n(Ab[j]); + std::cerr << "b_hat_k[" << j << "]: " << beautify(b_hat_k[j]) << std::endl; + } + + // Compute point-wise multiplication + std::vector c_hat_k{num_chunks, bvt{}}; + for(std::size_t j = 1; j < num_chunks; j += 2) + { + c_hat_k[j] = unsigned_multiplier(a_hat_k[j], b_hat_k[j]); + std::cerr << "c_hat_k[" << j << "]: " << beautify(c_hat_k[j]) << std::endl; + } + + // Apply inverse NTT + for(std::size_t nu = address_bits(num_chunks) - 1; nu > 0; --nu) + { + const std::size_t bit_nu_plus_1 = (std::size_t)1 << nu; + std::size_t bits_up_to_nu_plus_1 = 0; + for(std::size_t i = 0; i < nu; ++i) + bits_up_to_nu_plus_1 |= 1 << i; + + // we only need odd ones + for(std::size_t k = 1; k < num_chunks; k += 2) + { + if((k & bit_nu_plus_1) == 0) + continue; + + bvt c_hat_k_nu_plus_1_bit_is_zero = c_hat_k[k & ~bit_nu_plus_1]; + + c_hat_k[k & ~bit_nu_plus_1] = shift( + cyclic_add(c_hat_k_nu_plus_1_bit_is_zero, c_hat_k[k]), + shiftt::ROTATE_RIGHT, + 1); + std::cerr << "c_hat_k[" << nu << "](" << (k & ~bit_nu_plus_1) + << "): " << beautify(c_hat_k[k & ~bit_nu_plus_1]) << std::endl; + + const std::size_t chi = (k & bits_up_to_nu_plus_1) + << (address_bits(num_chunks) - 1 - nu); + const std::size_t omega = m % 2 == 1 ? 
2 : 4; + const std::size_t shift_dist = chi * omega / 2 + 1; + std::cerr << "SHIFT: " << shift_dist << std::endl; + + c_hat_k[k] = shift( + cyclic_add( + c_hat_k_nu_plus_1_bit_is_zero, + shift(c_hat_k[k], shiftt::ROTATE_LEFT, (std::size_t)1 << n)), + shiftt::ROTATE_RIGHT, + shift_dist); + std::cerr << "c_hat_k[" << nu << "](" << k + << "): " << beautify(c_hat_k[k]) << std::endl; + } + } + // Reduce by F_n + std::vector z_j_mod_F_n; + z_j_mod_F_n.reserve(num_chunks / 2); + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + // reverse n - 1 (n - 2 if m is even) bits of j + std::size_t k = 0; + for(std::size_t nu = 0; nu < address_bits(num_chunks) - 1; ++nu) + { + k |= (j & (1 << nu)) >> nu; + k <<= 1; + } + k |= 1; + std::cerr << "j " << j << " maps to " << k << std::endl; + z_j_mod_F_n.push_back(reduce_to_mod_F_n(c_hat_k[k])); + std::cerr << "z_j_mod_F_n[" << j << "]: " << beautify(z_j_mod_F_n[j]) + << std::endl; + } + + // Compute final coefficients as eta + delta * F_n where delta = eta - xi for + // eta z_j and xi c_hat_k. + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + bvt eta = z_j_mod_F_n[j]; + std::cerr << "eta[" << j << "]: " << beautify(eta) << std::endl; + bvt xi = z_j[j]; + std::cerr << "xi[" << j << "]: " << beautify(xi) << std::endl; + // TODO: couldn't we do this over just xi.size() bits instead? + bvt delta = sub(eta, zero_extension(xi, eta.size())); + CHECK_RETURN(delta.size() >= xi.size()); + delta.resize(xi.size()); + std::cerr << "delta[" << j << "]: " << beautify(delta) << std::endl; + z_j[j] = add( + zero_extension(eta, two_to_m_plus_1), + add( + shift( + zero_extension(delta, two_to_m_plus_1), + shiftt::SHIFT_LEFT, + (std::size_t)1 << n), + zero_extension(delta, two_to_m_plus_1))); + std::cerr << "z_j[" << j << "]: " << beautify(z_j[j]) << std::endl; + } + + bvt result = zeros(two_to_m_plus_1); + for(std::size_t j = 0; j < num_chunks / 2; ++j) + { + if(chunk_size * j >= a.size()) + break; + result = add(result, shift(z_j[j], shiftt::SHIFT_LEFT, chunk_size * j)); + } + std::cerr << "result: " << beautify(result) << std::endl; + CHECK_RETURN(result.size() >= a.size()); + result.resize(a.size()); + std::cerr << "result resized: " << beautify(result) << std::endl; + + return result; +} + bvt bv_utilst::unsigned_multiplier_no_overflow( const bvt &op0, const bvt &op1) @@ -2180,6 +2559,8 @@ bvt bv_utilst::signed_multiplier(const bvt &op0, const bvt &op1) bvt result = unsigned_karatsuba_multiplier(neg0, neg1); #elif defined(USE_TOOM_COOK) bvt result = unsigned_toom_cook_multiplier(neg0, neg1); +#elif defined(USE_SCHOENHAGE_STRASSEN) + bvt result = unsigned_schoenhage_strassen_multiplier(neg0, neg1); #else bvt result=unsigned_multiplier(neg0, neg1); #endif @@ -2256,6 +2637,9 @@ bvt bv_utilst::multiplier( #elif defined(USE_TOOM_COOK) case representationt::UNSIGNED: return unsigned_toom_cook_multiplier(op0, op1); +#elif defined(USE_SCHOENHAGE_STRASSEN) + case representationt::UNSIGNED: + return unsigned_schoenhage_strassen_multiplier(op0, op1); #else case representationt::UNSIGNED: return unsigned_multiplier(op0, op1); #endif diff --git a/src/solvers/flattening/bv_utils.h b/src/solvers/flattening/bv_utils.h index ac376f7e1d7..560849e334e 100644 --- a/src/solvers/flattening/bv_utils.h +++ b/src/solvers/flattening/bv_utils.h @@ -82,6 +82,7 @@ class bv_utilst bvt unsigned_karatsuba_multiplier(const bvt &op0, const bvt &op1); bvt unsigned_karatsuba_full_multiplier(const bvt &op0, const bvt &op1); bvt unsigned_toom_cook_multiplier(const bvt &op0, const bvt &op1); + bvt 
unsigned_schoenhage_strassen_multiplier(const bvt &a, const bvt &b);
   bvt signed_multiplier(const bvt &op0, const bvt &op1);
   bvt multiplier(const bvt &op0, const bvt &op1, representationt rep);
   bvt multiplier_no_overflow(

From 7c1f35629a43f4ddc2c9085c61dfb4ba5f6def14 Mon Sep 17 00:00:00 2001
From: Michael Tautschnig
Date: Wed, 9 Jul 2025 17:04:22 +0000
Subject: [PATCH 09/13] WIP: Comba multiplication scheme

---
 src/solvers/flattening/bv_utils.cpp | 49 ++++++++++++++++++++++++++++-
 src/solvers/flattening/bv_utils.h   |  1 +
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp
index 41850dc0b1a..555705e5d96 100644
--- a/src/solvers/flattening/bv_utils.cpp
+++ b/src/solvers/flattening/bv_utils.cpp
@@ -806,6 +806,48 @@ bvt bv_utilst::dadda_tree(const std::vector<bvt> &pps)
   return add(a, b);
 }

+bvt bv_utilst::comba_column_wise(const std::vector<bvt> &pps)
+{
+  PRECONDITION(!pps.empty());
+
+  std::vector<bvt> columns(pps.front().size());
+  for(const auto &pp : pps)
+  {
+    PRECONDITION(pp.size() == pps.front().size());
+    for(std::size_t i = 0; i < pp.size(); ++i)
+    {
+      if(!pp[i].is_false())
+        columns[i].push_back(pp[i]);
+    }
+  }
+
+  bvt result;
+  result.reserve(columns.size());
+
+  for(std::size_t i = 0; i < columns.size(); ++i)
+  {
+    const bvt &column = columns[i];
+
+    if(column.empty())
+      result.push_back(const_literal(false));
+    else
+    {
+      bvt column_sum = popcount(column);
+      CHECK_RETURN(!column_sum.empty());
+      result.push_back(column_sum.front());
+      for(std::size_t j = 1; j < column_sum.size(); ++j)
+      {
+        if(i + j >= columns.size())
+          break;
+        if(!column_sum[j].is_false())
+          columns[i + j].push_back(column_sum[j]);
+      }
+    }
+  }
+
+  return result;
+}
+
 // Wallace tree multiplier. This is disabled, as runtimes have
 // been observed to go up by 5%-10%, and on some models even by 20%.
// #define WALLACE_TREE @@ -958,10 +1000,11 @@ bvt bv_utilst::dadda_tree(const std::vector &pps) // #define RADIX_MULTIPLIER 8 // #define USE_KARATSUBA // #define USE_TOOM_COOK -#define USE_SCHOENHAGE_STRASSEN +// #define USE_SCHOENHAGE_STRASSEN #ifdef RADIX_MULTIPLIER # define DADDA_TREE #endif +#define COMBA #ifdef RADIX_MULTIPLIER static bvt unsigned_multiply_by_3(propt &prop, const bvt &op) @@ -1850,6 +1893,8 @@ bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1) return wallace_tree(pps); #elif defined(DADDA_TREE) return dadda_tree(pps); +#elif defined(COMBA) + return comba_column_wise(pps); #else bvt product = pps.front(); @@ -2118,6 +2163,8 @@ bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) return wallace_tree(c_ops); #elif defined(DADDA_TREE) return dadda_tree(c_ops); +#elif defined(COMBA) + return comba_column_wise(c_ops); #else bvt product = c_ops.front(); diff --git a/src/solvers/flattening/bv_utils.h b/src/solvers/flattening/bv_utils.h index 560849e334e..58de3dff4ff 100644 --- a/src/solvers/flattening/bv_utils.h +++ b/src/solvers/flattening/bv_utils.h @@ -254,6 +254,7 @@ class bv_utilst bvt wallace_tree(const std::vector &pps); bvt dadda_tree(const std::vector &pps); + bvt comba_column_wise(const std::vector &pps); }; #endif // CPROVER_SOLVERS_FLATTENING_BV_UTILS_H From 6bb24501feb83684eb7d9d5a1642426542530483 Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Mon, 14 Jul 2025 10:42:06 +0000 Subject: [PATCH 10/13] WIP: Schoenhage-Strassen debugging --- src/solvers/flattening/bv_utils.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 555705e5d96..e9f8077f6a4 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -1000,11 +1000,12 @@ bvt bv_utilst::comba_column_wise(const std::vector &pps) // #define RADIX_MULTIPLIER 8 // #define USE_KARATSUBA // #define USE_TOOM_COOK -// #define USE_SCHOENHAGE_STRASSEN +#define USE_SCHOENHAGE_STRASSEN #ifdef RADIX_MULTIPLIER +//# define COMBA # define DADDA_TREE #endif -#define COMBA +// #define COMBA #ifdef RADIX_MULTIPLIER static bvt unsigned_multiply_by_3(propt &prop, const bvt &op) @@ -2178,8 +2179,9 @@ bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( const bvt &a, - const bvt &b) + const bvt &_b) { + bvt b = a; PRECONDITION(a.size() == b.size()); // Running examples: we want to multiple 213 by 15 as 8- or 9-bit integers. From 04da08135f6dbbb7c2241f8fd3a5e1dcf8cfddaa Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Tue, 15 Jul 2025 08:23:31 +0000 Subject: [PATCH 11/13] fixup! 
Implement Karatsuba's multiplication algorithm --- src/solvers/flattening/bv_utils.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index e9f8077f6a4..bd0e562e5ca 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -1973,14 +1973,17 @@ bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1) return unsigned_multiplier(_op0, _op1); const std::size_t op_size = _op0.size(); - // only use this approach for powers of two - if(op_size == 0 || (op_size & (op_size - 1)) != 0) - return unsigned_multiplier(_op0, _op1); - if(op_size == 1) return {prop.land(_op0[0], _op1[0])}; - const std::size_t half_op_size = op_size >> 1; + // Make sure we work with operands the length of which are powers of two + const std::size_t log2 = address_bits(op_size); + PRECONDITION(sizeof(std::size_t) * CHAR_BIT > log2); + const std::size_t two_to_log2 = (std::size_t)1 << log2; + bvt a = zero_extension(_op0, two_to_log2); + bvt b = zero_extension(_op1, two_to_log2); + + const std::size_t half_op_size = two_to_log2 >> 1; // We split each of the operands in half and treat them as coefficients of a // polynomial a * 2^half_op_size + b. Straightforward polynomial @@ -2012,10 +2015,10 @@ bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1) // Therefore, after adding (dh + cg) the multiplication can safely be added // over just 2 bits. - bvt x0{_op0.begin(), _op0.begin() + half_op_size}; - bvt x1{_op0.begin() + half_op_size, _op0.end()}; - bvt y0{_op1.begin(), _op1.begin() + half_op_size}; - bvt y1{_op1.begin() + half_op_size, _op1.end()}; + bvt x0{a.begin(), a.begin() + half_op_size}; + bvt x1{a.begin() + half_op_size, a.end()}; + bvt y0{b.begin(), b.begin() + half_op_size}; + bvt y1{b.begin() + half_op_size, b.end()}; bvt z0 = unsigned_karatsuba_full_multiplier(x0, y0); bvt z1 = add( @@ -2023,7 +2026,11 @@ bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1) unsigned_karatsuba_multiplier(x0, y1)); bvt z1_full = concatenate(zeros(half_op_size), z1); - return add(z0, z1_full); + bvt result = add(z0, z1_full); + CHECK_RETURN(result.size() >= op_size); + if(result.size() > op_size) + result.resize(op_size); + return result; } bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) From 5290671a987fd41d06c81e5a607beb6f51a213f3 Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Tue, 15 Jul 2025 08:23:43 +0000 Subject: [PATCH 12/13] fixup! 
Symbolic Toom-Cook multiplication --- src/solvers/flattening/bv_utils.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index bd0e562e5ca..5e66c14c9ca 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -2035,14 +2035,13 @@ bvt bv_utilst::unsigned_karatsuba_multiplier(const bvt &_op0, const bvt &_op1) bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1) { + PRECONDITION(_op0.size() == _op1.size()); PRECONDITION(!_op0.empty()); - PRECONDITION(!_op1.empty()); - if(_op1.size() == 1) - return unsigned_multiplier(_op0, _op1); + if(_op0.size() == 1) + return {prop.land(_op0[0], _op1[0])}; // break up _op0, _op1 in groups of at most GROUP_SIZE bits - PRECONDITION(_op0.size() == _op1.size()); #define GROUP_SIZE 8 const std::size_t d_bits = 2 * GROUP_SIZE + From a08b3f60c836bd7dfc333740bc8312bbfb6386f4 Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Tue, 15 Jul 2025 09:13:20 +0000 Subject: [PATCH 13/13] fixup! WIP: Schoenhage-Strassen debugging --- src/solvers/flattening/bv_utils.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp index 5e66c14c9ca..52e38c0543f 100644 --- a/src/solvers/flattening/bv_utils.cpp +++ b/src/solvers/flattening/bv_utils.cpp @@ -2190,7 +2190,7 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( bvt b = a; PRECONDITION(a.size() == b.size()); - // Running examples: we want to multiple 213 by 15 as 8- or 9-bit integers. + // Running example: we want to multiply 213 by 15 as 8- or 9-bit integers. // That is, we seek to multiply 11010101 (011010101) by 00001111 (000001111). // ^bit 7 ^bit 0 // The expected result is 123 as both an 8-bit and 9-bit result (001111011). @@ -2245,8 +2245,10 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( { a_rho.emplace_back( a_ext.begin() + i * chunk_size, a_ext.begin() + (i + 1) * chunk_size); + std::cerr << "a_rho[" << i << "]: " << beautify(a_rho.back()) << std::endl; b_sigma.emplace_back( b_ext.begin() + i * chunk_size, b_ext.begin() + (i + 1) * chunk_size); + std::cerr << "b_sigma[" << i << "]: " << beautify(b_sigma.back()) << std::endl; } // For our example we now have // a_rho = [ 0101, 1101, 0000, ..., 0000 ] @@ -2268,11 +2270,13 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( ++rho) { const std::size_t sigma = tau - rho; + std::cerr << "Inner multiplication a_" << rho << " * b_" << sigma << std::endl; gamma_tau[tau] = add( gamma_tau[tau], + zero_extension( unsigned_multiplier( - zero_extension(a_rho[rho], 3 * n + 5), - zero_extension(b_sigma[sigma], 3 * n + 5))); + zero_extension(a_rho[rho], chunk_size * 2), + zero_extension(b_sigma[sigma], chunk_size * 2)), 3 * n + 5)); } } // For our example we obtain @@ -2284,6 +2288,7 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( c_tau.reserve(num_chunks); for(std::size_t tau = 0; tau < num_chunks; ++tau) { + std::cerr << "gamma_tau[" << tau << "]: " << beautify(gamma_tau[tau]) << std::endl; c_tau.push_back(add(gamma_tau[tau], gamma_tau[tau + num_chunks])); CHECK_RETURN(c_tau.back().size() >= address_bits(num_chunks) + 1); c_tau.back().resize(address_bits(num_chunks) + 1); @@ -2292,6 +2297,10 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier( // For our example we obtain // c_tau = [ 01011, 00011, 0... 
]
+
+  // XXX 19*19 seems ok up to here with
+  // http://malte-leip.net/beschreibung_ssa.pdf,
+  // https://de.wikipedia.org/wiki/Sch%C3%B6nhage-Strassen-Algorithmus
+
   // Compute z_j = c_j - c_{j + 2^n} (mod 2^(n + 2)) (mod 2^(n + 1) and c_{j +
   // 2^(n - 1)} when m is even)
   std::vector<bvt> z_j;