@@ -140,7 +140,8 @@ template<typename TFloat,
       bool bWeight,
       size_t cCompilerScores,
       int cCompilerPack,
-      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores, int>::type = 0>
+      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && (!bHessian || 8 != TFloat::TInt::k_cSIMDPack),
+            int>::type = 0>
 GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridge* const pParams) {

    static_assert(1 == cCompilerScores, "This specialization of BinSumsBoostingInternal cannot handle multiclass.");
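The clause added here, `(!bHessian || 8 != TFloat::TInt::k_cSIMDPack)`, is the complement of the condition on the interleaved specialization enabled further down (`bHessian && 8 == TFloat::TInt::k_cSIMDPack`), so exactly one overload is viable for any template argument combination. A minimal standalone sketch of this dispatch pattern, using hypothetical names:

```cpp
#include <cstdio>
#include <type_traits>

// Two overloads whose enable_if conditions are mutually exclusive complements:
// for any <bHessian, cSIMDPack> pair, exactly one template is enabled, so the
// call never ambiguates and never falls through to neither.
template<bool bHessian, int cSIMDPack,
      typename std::enable_if<!bHessian || 8 != cSIMDPack, int>::type = 0>
static void BinSums() {
   std::printf("generic path\n");
}

template<bool bHessian, int cSIMDPack,
      typename std::enable_if<bHessian && 8 == cSIMDPack, int>::type = 0>
static void BinSums() {
   std::printf("interleaved 8-wide hessian path\n");
}

int main() {
   BinSums<false, 8>(); // generic path
   BinSums<true, 4>();  // generic path
   BinSums<true, 8>();  // interleaved 8-wide hessian path
   return 0;
}
```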
@@ -362,42 +363,14 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
 }


-/*
-This speculative version is a specialization where bHessian is true. When there is
-a hessian, instead of loading the gradients from the bins first and then the hessians
-after that, we can use a gathering load to get the gradient and hessian for the
-first 1/2 of the samples, then get the next 1/2 of the samples. The potential
-benefit is that the CPU can fetch gradients and hessians that are located next
-to each other. Unfortunately, this seems to introduce a data dependency issue, since
-the first 1/2 of the gradients/hessians needs to be loaded/modified/stored before
-the next 1/2 can be worked on. This seems slower, but more investigation is required.
-
-
-inline void PermuteForInterleaf(Avx2_32_Int& v0, Avx2_32_Int& v1) const noexcept {
-   // this function permutes the values into positions that the Interleaf function expects
-   // but for any SIMD implementation the positions can be variable as long as they work together
-   v0 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(0, 0, 1, 1, 4, 4, 5, 5)));
-   v1 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(2, 2, 3, 3, 6, 6, 7, 7)));
-}
-inline static Avx2_32_Int MakeAlternating() noexcept { return Avx2_32_Int(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)); }
-inline static Avx2_32_Int MakeHalfIndexes() noexcept { return Avx2_32_Int(_mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); }
-
-inline static void Interleaf(Avx2_32_Float& val0, Avx2_32_Float& val1) noexcept {
-   // this function permutes the values into positions that the PermuteForInterleaf function expects
-   // but for any SIMD implementation, the positions can be variable as long as they work together
-   __m256 temp = _mm256_unpacklo_ps(val0.m_data, val1.m_data);
-   val1 = Avx2_32_Float(_mm256_unpackhi_ps(val0.m_data, val1.m_data));
-   val0 = Avx2_32_Float(temp);
-}
-
 template<typename TFloat,
       bool bParallel,
       bool bCollapsed,
       bool bHessian,
       bool bWeight,
       size_t cCompilerScores,
       int cCompilerPack,
-      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian, int>::type = 0>
+      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian && 8 == TFloat::TInt::k_cSIMDPack, int>::type = 0>
 GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridge* const pParams) {

    static_assert(1 == cCompilerScores, "This specialization of BinSumsBoostingInternal cannot handle multiclass.");
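The helpers being un-commented above work as a matched pair: `Interleaf` rearranges the two loaded registers so each sample's gradient and hessian land in adjacent lanes, and `PermuteForInterleaf` duplicates the bin indexes into the same lane order (0,0,1,1,4,4,5,5 and 2,2,3,3,6,6,7,7), so each index lines up with its sample's pair. A standalone sketch of the lane pattern this relies on, assuming AVX2 (compile with `-mavx2`):

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
   // g0..g7 and h0..h7 for one 8-wide SIMD pack of samples
   __m256 grad = _mm256_setr_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
   __m256 hess = _mm256_setr_ps(10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f);

   // unpacklo/unpackhi interleave the two registers within each 128-bit half,
   // which is exactly what the Interleaf helper above performs:
   __m256 v0 = _mm256_unpacklo_ps(grad, hess); // g0 h0 g1 h1 | g4 h4 g5 h5
   __m256 v1 = _mm256_unpackhi_ps(grad, hess); // g2 h2 g3 h3 | g6 h6 g7 h7

   float out[8];
   _mm256_storeu_ps(out, v0);
   for(int i = 0; i < 8; ++i) {
      std::printf("%g ", out[i]); // prints: 0 10 1 11 4 14 5 15
   }
   std::printf("\n");
   return 0;
}
```

Note that samples 0, 1, 4, 5 end up in `v0` and samples 2, 3, 6, 7 in `v1`, which is why `PermuteForInterleaf` selects indexes in that same 0,0,1,1,4,4,5,5 / 2,2,3,3,6,6,7,7 order rather than sequentially.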
@@ -419,8 +392,7 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg

    const typename TFloat::T* pGradientAndHessian =
          reinterpret_cast<const typename TFloat::T*>(pParams->m_aGradientsAndHessians);
-   const typename TFloat::T* const pGradientsAndHessiansEnd =
-         pGradientAndHessian + size_t{2} * cSamples;
+   const typename TFloat::T* const pGradientsAndHessiansEnd = pGradientAndHessian + size_t{2} * cSamples;

    static constexpr typename TFloat::TInt::T cBytesPerBin = static_cast<typename TFloat::TInt::T>(
          GetBinSize<typename TFloat::T, typename TFloat::TInt::T>(false, false, bHessian, size_t{1}));
@@ -462,8 +434,25 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg

    EBM_ASSERT(0 == pParams->m_cBytesFastBins % static_cast<size_t>(cBytesPerBin));

+   // The compiler is normally pretty good about optimizing multiplications into shifts when possible,
+   // BUT, when compiling for SIMD, it seems to use a SIMD multiplication instruction instead of shifts
+   // even when the multiplication has a fixed compile-time constant value that is a power of 2, so
+   // we manually convert the multiplications into shifts.
+   //
+   // We also tried the Multiply templated function that is designed to convert multiplications
+   // into shifts, but using that templated function breaks the compiler optimization that unrolls
+   // the bitpacking loop.
+   //
+   constexpr static bool bSmall = 4 == cBytesPerBin;
+   constexpr static bool bMed = 8 == cBytesPerBin;
+   constexpr static bool bLarge = 16 == cBytesPerBin;
+   static_assert(bSmall || bMed || bLarge, "cBytesPerBin size must be small, medium, or large");
+   constexpr static int cFixedShift = bSmall ? 2 : bMed ? 3 : 4;
+   static_assert(1 << cFixedShift == cBytesPerBin, "cFixedShift must match the BinSize");
+   EBM_ASSERT(0 == pParams->m_cBytesFastBins % static_cast<size_t>(cBytesPerBin));
+
    const typename TFloat::TInt offsets =
-         TFloat::TInt::MakeHalfIndexes() * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins);
+         TFloat::TInt::MakeIndexes() * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins >> cFixedShift);

    do {
       // TODO: it might be useful to preload the iTensorBinCombined, weight, gradient, and hessian for the next loop
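The new `offsets` computation presumably gives each of the 8 SIMD lanes a private copy of the fast bins (so gather/scatter lanes within one pack cannot collide on the same bin), and it keeps the values in bin units so the per-iteration byte scaling can happen through the fixed `<< cFixedShift` applied by `DoubleLoad`/`DoubleStore` rather than a SIMD multiply. A scalar sketch of the arithmetic, assuming `MakeIndexes()` yields lanes {0, 1, ..., 7} and using a hypothetical buffer size:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
   // bMed case: one float32 gradient plus one float32 hessian per bin
   constexpr size_t cBytesPerBin = 8;
   constexpr int cFixedShift = 3; // 1 << 3 == cBytesPerBin
   static_assert((size_t{1} << cFixedShift) == cBytesPerBin, "shift must match bin size");

   // byte size of one complete copy of the fast bins (hypothetical value)
   const size_t cBytesFastBins = 4096;

   for(size_t iLane = 0; iLane < 8; ++iLane) {
      // per-lane base offset in bin units; the gather/scatter later applies
      // << cFixedShift, turning bin units back into byte offsets
      const size_t offsetInBins = iLane * (cBytesFastBins >> cFixedShift);
      std::printf("lane %zu -> bin %zu (byte %zu)\n",
            iLane, offsetInBins, offsetInBins << cFixedShift);
   }
   return 0;
}
```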
@@ -506,36 +495,30 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
          TFloat gradhess1 = TFloat::Load(&pGradientAndHessian[TFloat::k_cSIMDPack]);
          pGradientAndHessian += size_t{2} * TFloat::k_cSIMDPack;

-         TFloat::Interleaf(gradhess0, gradhess1);
-
          if(bWeight) {
             gradhess0 *= weight;
             gradhess1 *= weight;
          }

-         typename TFloat::TInt iTensorBin =
-               (((iTensorBinCombined >> cShift) & maskBits) << (TFloat::k_cTypeShift + 1)) + offsets;
+         TFloat::Interleaf(gradhess0, gradhess1);
+
+         typename TFloat::TInt iTensorBin = ((iTensorBinCombined >> cShift) & maskBits) + offsets;
+
+         iTensorBin = PermuteForInterleaf(iTensorBin);

          // TODO: instead of loading the gradient and hessian as separate loads, it might be better to
          // load the gradients and hessians as part of the same gather load because the CPU might
          // be better at loading items from the same cache line, and also because it would reduce our
          // memory accesses by a factor of 2x since we could then handle two items in this loop sequentially.
          // The drawback is that we would need to shuffle the indexes and gradient/hessian values that we get back.

-         typename TFloat::TInt iTensorBin0;
-         typename TFloat::TInt iTensorBin1;
-         iTensorBin.PermuteForInterleaf(iTensorBin0, iTensorBin1);
+         TFloat bin0;
+         TFloat bin1;

-         iTensorBin0 = iTensorBin0 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
-         iTensorBin1 = iTensorBin1 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
-
-         TFloat bin0 = TFloat::template Load<0>(aBins, iTensorBin0);
+         TFloat::template DoubleLoad<cFixedShift>(aBins, iTensorBin, bin0, bin1);
          bin0 += gradhess0;
-         bin0.template Store<0>(aBins, iTensorBin0);
-
-         TFloat bin1 = TFloat::template Load<0>(aBins, iTensorBin1);
          bin1 += gradhess1;
-         bin1.template Store<0>(aBins, iTensorBin1);
+         TFloat::template DoubleStore<cFixedShift>(aBins, iTensorBin, bin0, bin1);

          cShift -= cBitsPerItemMax;
       } while(0 <= cShift);
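`DoubleLoad` and `DoubleStore` move two registers' worth of adjacent {gradient, hessian} pairs through the single permuted index register, with the `cFixedShift` template argument serving as the gather/scatter scale. A scalar model of the per-sample update this presumably amounts to (the helper name and layout here are assumptions; only `DoubleLoad`/`DoubleStore`, `cFixedShift`, and the two-float bin come from the diff):

```cpp
#include <cstddef>

// Scalar stand-in for one lane-pair of the SIMD sequence above. The bin at
// index iBin (in bin units) holds [gradient, hessian] as two adjacent floats
// at byte offset iBin << cFixedShift; the load, accumulate, and store for a
// bin must complete before that bin can be touched again, which is the data
// dependency concern raised in the old comment block.
template<int cFixedShift>
static void AccumulateOneSample(
      void* const aBins, const size_t iBin, const float grad, const float hess) {
   float* const pBin = reinterpret_cast<float*>(
         static_cast<char*>(aBins) + (iBin << cFixedShift));
   pBin[0] += grad; // gradient accumulator: first slot of the bin
   pBin[1] += hess; // hessian accumulator: second slot of the bin
}

int main() {
   alignas(32) float bins[16] = {}; // eight 8-byte bins (cFixedShift == 3)
   AccumulateOneSample<3>(bins, 0, 0.5f, 0.25f);
   AccumulateOneSample<3>(bins, 1, -1.0f, 2.0f);
   return 0;
}
```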
@@ -544,7 +527,6 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
       }
    } while(pGradientsAndHessiansEnd != pGradientAndHessian);
 }
-*/

 template<typename TFloat,
       bool bParallel,