@@ -664,14 +664,15 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
       INVARIANT(a.size() == b.size(), "groups should be of equal size");
       INVARIANT(a.size() == c.size(), "groups should be of equal size");

-      bvt s(a.size()), t(a.size());
+      bvt s, t(a.size(), const_literal(false));
+      s.reserve(a.size());

       for(std::size_t bit=0; bit<a.size(); bit++)
       {
-        // \todo reformulate using full_adder
-        s[bit]=prop.lxor(a[bit], prop.lxor(b[bit], c[bit]));
-        t[bit]=(bit==0)?const_literal(false):
-               carry(a[bit-1], b[bit-1], c[bit-1]);
+        literalt carry_out;
+        s.push_back(full_adder(a[bit], b[bit], c[bit], carry_out));
+        if(bit + 1 < a.size())
+          t[bit + 1] = carry_out;
       }

       new_pps.push_back(std::move(s));
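
The hunk above replaces the hand-rolled XOR/carry computation with one full_adder call per bit, i.e. a 3:2 carry-save step: three addends of equal width are compressed into a sum vector s and a carry vector t shifted up by one position, with the carry out of the top bit dropped (arithmetic is modulo 2^width). The following is a minimal standalone sketch of that step; plain bools and a local full_adder stand in for CBMC's literalt/propt machinery, so these names only mirror the diff and are not the solver API.

```cpp
// Standalone sketch of the 3:2 carry-save step, with plain bools standing in
// for CBMC's literalt/propt (illustrative stand-ins only).
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

using bv = std::vector<bool>;

// single full adder: returns {sum, carry-out}
std::pair<bool, bool> full_adder(bool a, bool b, bool c)
{
  const bool sum = (a != b) != c;                      // XOR of the three bits
  const bool carry = (a && b) || (a && c) || (b && c); // majority
  return {sum, carry};
}

// Reduce three addends of equal width to two: a sum vector s and a carry
// vector t, where the carry produced at bit i lands at bit i + 1 and the
// carry out of the top bit is dropped (modulo 2^width).
std::pair<bv, bv> reduce_3_to_2(const bv &a, const bv &b, const bv &c)
{
  assert(a.size() == b.size() && a.size() == c.size());
  bv s, t(a.size(), false);
  s.reserve(a.size());
  for(std::size_t bit = 0; bit < a.size(); bit++)
  {
    const auto sum_carry = full_adder(a[bit], b[bit], c[bit]);
    s.push_back(sum_carry.first);
    if(bit + 1 < a.size())
      t[bit + 1] = sum_carry.second;
  }
  return {s, t};
}

int main()
{
  // 3 + 5 + 6 = 14; with 4-bit vectors, s + t must equal 14 modulo 16
  const bv a{true, true, false, false};  // 3, little-endian
  const bv b{true, false, true, false};  // 5
  const bv c{false, true, true, false};  // 6
  const auto st = reduce_3_to_2(a, b, c);
  unsigned s_val = 0, t_val = 0;
  for(std::size_t i = 0; i < 4; i++)
  {
    s_val |= st.first[i] << i;
    t_val |= st.second[i] << i;
  }
  assert((s_val + t_val) % 16 == 14);
  return 0;
}
```

Calling the same full_adder with a false carry-in degenerates into a half adder (sum = a XOR b, carry = a AND b), which is what the Dadda reduction further down relies on.
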
@@ -687,66 +688,276 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
   }
 }

-bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
+bvt bv_utilst::dadda_tree(const std::vector<bvt> &pps)
 {
-  #if 1
-  bvt op0=_op0, op1=_op1;
+  PRECONDITION(!pps.empty());

-  if(is_constant(op1))
-    std::swap(op0, op1);
+  using columnt = std::list<literalt>;
+  std::vector<columnt> columns(pps.front().size());
+  for(const auto &pp : pps)
+  {
+    PRECONDITION(pp.size() == pps.front().size());
+    for(std::size_t i = 0; i < pp.size(); ++i)
+    {
+      if(!pp[i].is_false())
+        columns[i].push_back(pp[i]);
+    }
+  }

-  bvt product;
-  product.resize(op0.size());
+  std::list<std::size_t> dadda_sequence;
+  for(std::size_t d = 2; d < pps.front().size(); d = (d * 3) / 2)
+    dadda_sequence.push_front(d);

-  for(std::size_t i=0; i<product.size(); i++)
-    product[i]=const_literal(false);
-
-  for(std::size_t sum=0; sum<op0.size(); sum++)
-    if(op0[sum]!=const_literal(false))
+  for(auto d : dadda_sequence)
+  {
+    for(auto col_it = columns.begin(); col_it != columns.end();) // no ++col_it
     {
-      bvt tmpop = zeros(sum);
-      tmpop.reserve(op0.size());
+      if(col_it->size() <= d)
+        ++col_it;
+      else if(col_it->size() == d + 1)
+      {
+        // Dadda prescribes a half adder here, but OPTIMAL_FULL_ADDER makes
+        // full_adder actually build a half adder when carry-in is false.
+        literalt carry_out;
+        col_it->push_back(full_adder(
+          col_it->front(),
+          *std::next(col_it->begin()),
+          const_literal(false),
+          carry_out));
+        col_it->pop_front();
+        col_it->pop_front();
+        if(std::next(col_it) != columns.end())
+          std::next(col_it)->push_back(carry_out);
+      }
+      else
+      {
+        // We could also experiment with n:2 compressors (for n > 3, n=3 is the
+        // full adder as used below) that use circuits with lower gate count
+        // than just combining multiple full adders.
+        literalt carry_out;
+        col_it->push_back(full_adder(
+          col_it->front(),
+          *std::next(col_it->begin()),
+          *std::next(std::next(col_it->begin())),
+          carry_out));
+        col_it->pop_front();
+        col_it->pop_front();
+        col_it->pop_front();
+        if(std::next(col_it) != columns.end())
+          std::next(col_it)->push_back(carry_out);
+      }
+    }
+  }

-      for(std::size_t idx=sum; idx<op0.size(); idx++)
-        tmpop.push_back(prop.land(op1[idx-sum], op0[sum]));
+  bvt a, b;
+  a.reserve(pps.front().size());
+  b.reserve(pps.front().size());

-      product=add(product, tmpop);
+  for(const auto &col : columns)
+  {
+    PRECONDITION(col.size() <= 2);
+    switch(col.size())
+    {
+    case 0:
+      a.push_back(const_literal(false));
+      b.push_back(const_literal(false));
+      break;
+    case 1:
+      a.push_back(col.front());
+      b.push_back(const_literal(false));
+      break;
+    case 2:
+      a.push_back(col.front());
+      b.push_back(col.back());
     }
+  }

-  return product;
-  #else
-  // Wallace tree multiplier. This is disabled, as runtimes have
-  // been observed to go up by 5%-10%, and on some models even by 20%.
+  return add(a, b);
+}

-  // build the usual quadratic number of partial products
+// Wallace tree multiplier. This is disabled, as runtimes have
+// been observed to go up by 5%-10%, and on some models even by 20%.
+// #define WALLACE_TREE
+// Dadda's reduction scheme. This yields a smaller formula size than Wallace
+// trees (and also the default addition scheme), but remains disabled as it
+// isn't consistently more performant either.
+// #define DADDA_TREE
+
+// The following examples demonstrate the performance differences (with a
+// time-out of 7200 seconds):
+//
+// #ifndef BITWIDTH
+// #define BITWIDTH 8
+// #endif
+//
+// int main()
+// {
+//   __CPROVER_bitvector[BITWIDTH] a, b;
+//   __CPROVER_assert(a * 3 == a + a + a, "multiplication by 3");
+//   __CPROVER_assert(3 * a == a + a + a, "multiplication by 3");
+//   __CPROVER_bitvector[BITWIDTH] ab = a * b;
+//   __CPROVER_bitvector[BITWIDTH * 2] ab_check =
+//     (__CPROVER_bitvector[BITWIDTH * 2])a *
+//     (__CPROVER_bitvector[BITWIDTH * 2])b;
+//   __CPROVER_assert(
+//     ab == (__CPROVER_bitvector[BITWIDTH])ab_check, "should pass");
+// }
+//
+// |----|-----------------------------|-----------------------------|
+// |    |           CaDiCaL           |          MiniSat2           |
+// |----|-----------------------------|-----------------------------|
+// |  n | No tree | Wallace |  Dadda  | No tree | Wallace |  Dadda  |
+// |----|---------|---------|---------|---------|---------|---------|
+// |  1 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |
+// |  2 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |
+// |  3 |    0.01 |    0.01 |    0.00 |    0.00 |    0.00 |    0.00 |
+// |  4 |    0.01 |    0.01 |    0.01 |    0.01 |    0.01 |    0.01 |
+// |  5 |    0.04 |    0.04 |    0.04 |    0.01 |    0.01 |    0.01 |
+// |  6 |    0.11 |    0.13 |    0.12 |    0.04 |    0.05 |    0.06 |
+// |  7 |    0.28 |    0.46 |    0.44 |    0.15 |    0.27 |    0.31 |
+// |  8 |    0.50 |    1.55 |    1.09 |    0.90 |    1.06 |    1.36 |
+// |  9 |    2.22 |    3.63 |    2.67 |    3.40 |    5.85 |    3.44 |
+// | 10 |    2.79 |    4.81 |    4.69 |    4.32 |   28.41 |   28.01 |
+// | 11 |    8.48 |    4.45 |   11.99 |   38.24 |   98.55 |   86.46 |
+// | 12 |   14.52 |    4.86 |    5.80 |  115.00 |  140.11 |  461.70 |
+// | 13 |   33.62 |    5.56 |    6.14 |  210.24 |  805.59 |  609.01 |
+// | 14 |   37.23 |    6.11 |    8.01 |  776.75 |  689.96 | 6153.82 |
+// | 15 |  108.65 |    7.86 |   14.72 | 1048.92 | 1421.74 | 6881.78 |
+// | 16 |  102.61 |   14.08 |   18.54 | 1628.01 | timeout | 1943.85 |
+// | 17 |  117.89 |   18.53 |    9.19 | 4148.73 | timeout | timeout |
+// | 18 |  209.40 |    7.97 |    7.74 | 2760.51 | timeout | timeout |
+// | 19 |  168.21 |   18.04 |   15.00 | 2514.21 | timeout | timeout |
+// | 20 |  566.76 |   12.68 |   22.47 | 2609.09 | timeout | timeout |
+// | 21 |  786.31 |   23.80 |   23.80 | 2232.77 | timeout | timeout |
+// | 22 |  817.74 |   17.64 |   22.53 | 3866.70 | timeout | timeout |
+// | 23 | 1102.76 |   24.19 |   26.37 | timeout | timeout | timeout |
+// | 24 | 1319.59 |   27.37 |   29.95 | timeout | timeout | timeout |
+// | 25 | 1786.11 |   27.10 |   29.94 | timeout | timeout | timeout |
+// | 26 | 1952.18 |   31.08 |   33.95 | timeout | timeout | timeout |
+// | 27 | 6908.48 |   27.92 |   34.94 | timeout | timeout | timeout |
+// | 28 | 6919.34 |   36.63 |   33.78 | timeout | timeout | timeout |
+// | 29 | timeout |   27.95 |   40.69 | timeout | timeout | timeout |
+// | 30 | timeout |   36.94 |   31.59 | timeout | timeout | timeout |
+// | 31 | timeout |   38.41 |   40.04 | timeout | timeout | timeout |
+// | 32 | timeout |   33.06 |   91.38 | timeout | timeout | timeout |
+// |----|---------|---------|---------|---------|---------|---------|
+//
+// In summary, both Wallace tree and Dadda reduction are substantially more
+// efficient with CaDiCaL on the above code for all bit widths > 11, but
+// somewhat slower with MiniSat.
+//
+// #ifndef BITWIDTH
+// #define BITWIDTH 8
+// #endif
+//
+// int main()
+// {
+//   __CPROVER_bitvector[BITWIDTH] a, b, c, ab, bc, abc;
+//   ab = a * b;
+//   __CPROVER_bitvector[BITWIDTH * 2] ab_check =
+//     (__CPROVER_bitvector[BITWIDTH * 2])a *
+//     (__CPROVER_bitvector[BITWIDTH * 2])b;
+//   __CPROVER_assume(ab == ab_check);
+//   bc = b * c;
+//   __CPROVER_bitvector[BITWIDTH * 2] bc_check =
+//     (__CPROVER_bitvector[BITWIDTH * 2])b *
+//     (__CPROVER_bitvector[BITWIDTH * 2])c;
+//   __CPROVER_assume(bc == bc_check);
+//   abc = ab * c;
+//   __CPROVER_bitvector[BITWIDTH * 2] abc_check =
+//     (__CPROVER_bitvector[BITWIDTH * 2])ab *
+//     (__CPROVER_bitvector[BITWIDTH * 2])c;
+//   __CPROVER_assume(abc == abc_check);
+//   __CPROVER_assert(abc == a * bc, "associativity");
+// }
+//
+// |----|-----------------------------|-----------------------------|
+// |    |           CaDiCaL           |          MiniSat2           |
+// |----|-----------------------------|-----------------------------|
+// |  n | No tree | Wallace |  Dadda  | No tree | Wallace |  Dadda  |
+// |----|---------|---------|---------|---------|---------|---------|
+// |  1 |    0.00 |    0.00 |    0.00 |    0.01 |    0.01 |    0.01 |
+// |  2 |    0.01 |    0.01 |    0.01 |    0.01 |    0.01 |    0.01 |
+// |  3 |    0.02 |    0.03 |    0.03 |    0.01 |    0.01 |    0.01 |
+// |  4 |    0.05 |    0.07 |    0.06 |    0.02 |    0.02 |    0.02 |
+// |  5 |    0.17 |    0.18 |    0.14 |    0.04 |    0.04 |    0.04 |
+// |  6 |    0.50 |    0.63 |    0.63 |    0.13 |    0.14 |    0.13 |
+// |  7 |    1.01 |    1.15 |    0.90 |    0.43 |    0.47 |    0.47 |
+// |  8 |    1.56 |    1.76 |    1.75 |    3.35 |    2.39 |    1.75 |
+// |  9 |    2.07 |    2.48 |    1.75 |   11.20 |   12.64 |    7.94 |
+// | 10 |    3.58 |    3.88 |    3.54 |   20.23 |   23.49 |   15.66 |
+// | 11 |    5.84 |    7.46 |    5.31 |   49.32 |   39.86 |   44.15 |
+// | 12 |   11.71 |   16.85 |   13.40 |   69.32 |  156.57 |   46.50 |
+// | 13 |   33.22 |   41.95 |   34.43 |  250.91 |  294.73 |   79.47 |
+// | 14 |   76.27 |  109.59 |   84.49 |  226.98 |  259.84 |  258.08 |
+// | 15 |  220.01 |  330.10 |  267.11 |  783.73 | 1160.47 | 1262.91 |
+// | 16 |  591.91 |  981.16 |  808.33 | 2712.20 | 4286.60 | timeout |
+// | 17 | 1634.97 | 2574.81 | 2006.27 | timeout | timeout | timeout |
+// | 18 | 4680.16 | timeout | 6704.35 | timeout | timeout | timeout |
+// | 19 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 20 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 21 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 22 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 23 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 24 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 25 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 26 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 27 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 28 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 29 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 30 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 31 | timeout | timeout | timeout | timeout | timeout | timeout |
+// | 32 | timeout | timeout | timeout | timeout | timeout | timeout |
+// |----|---------|---------|---------|---------|---------|---------|
+//
+// In summary, Wallace tree reduction is slower for both solvers and all bit
+// widths (except BITWIDTH==8 with MiniSat2). Dadda multipliers get closer to
+// our multiplier that's not using a tree reduction scheme, but aren't uniformly
+// better either.

+bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
+{
   bvt op0=_op0, op1=_op1;

   if(is_constant(op1))
     std::swap(op0, op1);

+  // build the usual quadratic number of partial products
   std::vector<bvt> pps;
   pps.reserve(op0.size());

   for(std::size_t bit=0; bit<op0.size(); bit++)
-    if(op0[bit]!=const_literal(false))
-    {
-      // zeros according to weight
-      bvt pp = zeros(bit);
-      pp.reserve(op0.size());
+  {
+    if(op0[bit] == const_literal(false))
+      continue;

-      for(std::size_t idx=bit; idx<op0.size(); idx++)
-        pp.push_back(prop.land(op1[idx-bit], op0[bit]));
+    // zeros according to weight
+    bvt pp = zeros(bit);
+    pp.reserve(op0.size());

-      pps.push_back(pp);
-    }
+    for(std::size_t idx = bit; idx < op0.size(); idx++)
+      pp.push_back(prop.land(op1[idx - bit], op0[bit]));
+
+    pps.push_back(pp);
+  }

   if(pps.empty())
     return zeros(op0.size());
   else
+  {
+#ifdef WALLACE_TREE
     return wallace_tree(pps);
+#elif defined(DADDA_TREE)
+    return dadda_tree(pps);
+#else
+    bvt product = pps.front();

-  #endif
+    for(auto it = std::next(pps.begin()); it != pps.end(); ++it)
+      product = add(product, *it);
+
+    return product;
+#endif
+  }
 }

 bvt bv_utilst::unsigned_multiplier_no_overflow(
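
For reference, the dadda_sequence loop in dadda_tree above computes the classic Dadda column-height limits: starting from 2, each limit is 3/2 of the previous one (rounded down), and the reduction then visits them from the largest value below the operand width down to 2, capping every column's height at each step. A self-contained sketch of just that schedule; the width of 32 is an arbitrary example, not taken from the commit.

```cpp
// Sketch of the Dadda column-height schedule used above: 2, 3, 4, 6, 9, 13,
// 19, 28, ..., visited from the largest value below the operand width down
// to 2. Each pass reduces every column to at most that many bits.
#include <cstddef>
#include <iostream>
#include <list>

int main()
{
  const std::size_t width = 32; // arbitrary example operand width

  std::list<std::size_t> dadda_sequence;
  for(std::size_t d = 2; d < width; d = (d * 3) / 2)
    dadda_sequence.push_front(d);

  // for width == 32 this prints: 28 19 13 9 6 4 3 2
  for(const auto d : dadda_sequence)
    std::cout << d << ' ';
  std::cout << '\n';

  return 0;
}
```
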
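The partial-product loop in unsigned_multiplier builds one row per set bit of op0, shifted left according to that bit's weight, and the default (non-tree) path then folds the rows together with repeated add calls. Below is a plain-integer model of that behaviour, with illustrative names that are not part of the CBMC API; like the bit-vector version, it works modulo 2^width (here 2^32).

```cpp
// Plain-integer model of the default (non-tree) path of unsigned_multiplier:
// one partial product per set bit of op0, shifted according to its weight,
// then accumulated with repeated additions, all modulo 2^32.
#include <cassert>
#include <cstdint>
#include <vector>

std::uint32_t multiply_via_partial_products(std::uint32_t op0, std::uint32_t op1)
{
  std::vector<std::uint32_t> pps;
  for(unsigned bit = 0; bit < 32; bit++)
  {
    if(!(op0 & (std::uint32_t{1} << bit)))
      continue;
    pps.push_back(op1 << bit); // "zeros according to weight" == left shift
  }

  std::uint32_t product = 0;
  for(const auto pp : pps)
    product += pp; // the non-tree scheme adds the rows one after another

  return product;
}

int main()
{
  assert(multiply_via_partial_products(123456u, 654321u) == 123456u * 654321u);
  assert(multiply_via_partial_products(0u, 42u) == 0u);
  return 0;
}
```
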