numpy
diff --git a/‎npsr/common.h
Lines changed: 26 additions & 0 deletions b/‎npsr/common.h
Lines changed: 26 additions & 0 deletions
diff --git a/‎npsr/precise.h
Lines changed: 1 addition & 36 deletions b/‎npsr/precise.h
Lines changed: 1 addition & 36 deletions
diff --git a/‎npsr/trig/data/approx.h
Lines changed: 1563 additions & 0 deletions b/‎npsr/trig/data/approx.h
Lines changed: 1563 additions & 0 deletions
diff --git a/‎npsr/trig/data/approx.h.sol
Lines changed: 52 additions & 0 deletions b/‎npsr/trig/data/approx.h.sol
Lines changed: 52 additions & 0 deletions
diff --git a/‎npsr/trig/data/constants.h
Lines changed: 57 additions & 0 deletions b/‎npsr/trig/data/constants.h
Lines changed: 57 additions & 0 deletions
diff --git a/‎npsr/trig/data/constants.h.sol
Lines changed: 87 additions & 0 deletions b/‎npsr/trig/data/constants.h.sol
Lines changed: 87 additions & 0 deletions
diff --git a/‎npsr/trig/data/data.h
Lines changed: 11 additions & 0 deletions b/‎npsr/trig/data/data.h
Lines changed: 11 additions & 0 deletions
diff --git a/‎npsr/trig/data/data.h.sol
Lines changed: 10 additions & 0 deletions b/‎npsr/trig/data/data.h.sol
Lines changed: 10 additions & 0 deletions
diff --git a/‎npsr/trig/data/high.h
Lines changed: 54 additions & 0 deletions b/‎npsr/trig/data/high.h
Lines changed: 54 additions & 0 deletions
@@ -9,3 +9,29 @@
 #include "precise.h"
 
 #endif  // NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_
+// Per-target section: compiled only when the guard and HWY_TARGET_TOGGLE are in the same state.
+#if defined(NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_) == \
+    defined(HWY_TARGET_TOGGLE)  // NOLINT
+#ifdef NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
+#undef NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
+#else
+#define NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
+#endif
+// The guard was flipped above, so re-inclusion is skipped until the toggle changes again.
+HWY_BEFORE_NAMESPACE();
+namespace npsr::HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+using hn::DFromV;
+using hn::MFromD;
+using hn::Rebind;
+using hn::RebindToUnsigned;
+using hn::TFromD;
+using hn::TFromV;
+using hn::VFromD;
+constexpr bool kNativeFMA = HWY_NATIVE_FMA != 0;  // per-target boolean form of HWY_NATIVE_FMA
+// `inline`: defined in a header, so a plain external-linkage definition would be duplicated per TU.
+HWY_ATTR inline void DummyToSuppressUnusedWarning() {}
+}  // namespace npsr::HWY_NAMESPACE
+HWY_AFTER_NAMESPACE();
+
+#endif  // NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
@@ -18,27 +18,15 @@ constexpr auto kLowAccuracy = _LowAccuracy{};
 
 struct Round {  // Option tags; Precise tests its Args pack for these types.
   struct _Force {};
-  struct _Nearest {};
-  struct _Down {};
-  struct _Up {};
-  struct _Zero {};
   static constexpr auto kForce = _Force{};
-  static constexpr auto kNearest = _Nearest{};
-#if 0  // not used yet
-  static constexpr auto kDown = _Down{};
-  static constexpr auto kUp = _Up{};
-  static constexpr auto kZero = _Zero{};
-#endif
 };
 
 struct Subnormal {  // Option tags; Precise defaults to IEEE754 when neither DAZ nor FTZ is given.
   struct _DAZ {};
   struct _FTZ {};
   struct _IEEE754 {};
-#if 0  // not used yet
   static constexpr auto kDAZ = _DAZ{};
   static constexpr auto kFTZ = _FTZ{};
-#endif
   static constexpr auto kIEEE754 = _IEEE754{};
 };
 
@@ -137,19 +125,6 @@ class Precise {
 
   static constexpr bool kRoundForce =
       (std::is_same_v<Round::_Force, Args> || ...);
-  static constexpr bool _kRoundNearest =
-      (std::is_same_v<Round::_Nearest, Args> || ...);
-  static constexpr bool kRoundZero =
-      (std::is_same_v<Round::_Zero, Args> || ...);
-  static constexpr bool kRoundDown =
-      (std::is_same_v<Round::_Down, Args> || ...);
-  static constexpr bool kRoundUp = (std::is_same_v<Round::_Up, Args> || ...);
-  // only one rounding mode can be set
-  static_assert((_kRoundNearest + kRoundDown + kRoundUp + kRoundZero) <= 1,
-                "Only one rounding mode can be set at a time");
-  // if no rounding mode is set, default to round nearest
-  static constexpr bool kRoundNearest =
-      _kRoundNearest || (!kRoundDown && !kRoundUp && !kRoundZero);
 
   static constexpr bool kDAZ = (std::is_same_v<Subnormal::_DAZ, Args> || ...);
   static constexpr bool kFTZ = (std::is_same_v<Subnormal::_FTZ, Args> || ...);
@@ -162,17 +137,7 @@ class Precise {
   static constexpr bool kIEEE754 = _kIEEE754 || !(kDAZ || kFTZ);
 
  private:
-  int _NewRoundingMode() const {
-    if constexpr (kRoundDown) {
-      return FE_DOWNWARD;
-    } else if constexpr (kRoundUp) {
-      return FE_UPWARD;
-    } else if constexpr (kRoundZero) {
-      return FE_TOWARDZERO;
-    } else {
-      return FE_TONEAREST;
-    }
-  }
+  int _NewRoundingMode() const { return FE_TONEAREST; }  // always round-to-nearest; no other mode is selectable here
   int _rounding_mode = 0;
   bool _retrieve_rounding_mode = false;
   fexcept_t _exceptions;
 
@@ -0,0 +1,52 @@
+suppressmessage(186, 185, 184);  // mute Sollya diagnostic messages number 184, 185 and 186
+
+procedure ApproxLut4_(pT, pFunc, pFuncDriv) {  // returns ToStringCArray output for a table with 4 values per entry
+  var r, i, $;
+  // pT: type descriptor (kSize, kDigits, kRound, kCSFX); pFunc: function of x; pFuncDriv: its derivative.
+  $.num_lut = match pT.kSize
+    with 64: (2^9)
+    default: (2^8);  // 512 entries when kSize is 64, otherwise 256
+  // Low-part rounding as [bits, mode]: 24 bits toward zero for kSize 64, else kDigits bits to nearest.
+  $.low_round = match pT.kSize 
+    with 64: ([|24, RZ|])
+    default: ([|pT.kDigits, RN|]);
+  $.scale = 2.0 * pi / $.num_lut;  // angular step; entry i samples the angle i * scale in [0, 2*pi)
+  // r is the flat result list; each iteration appends [deriv, sigma, high, low].
+  r = [||];
+  for i from 0 to $.num_lut - 1 do {
+    $.angle = i * $.scale;
+    $.exact = pFunc($.angle);
+    $.high = pT.kRound($.exact);  // leading part of the function value
+    $.low = pT.kRound(round($.exact - $.high, $.low_round[0], $.low_round[1]));  // rounded remainder exact - high
+    // Split the derivative into sigma = 2^k plus a rounded correction term deriv.
+    $.deriv_exact = pFuncDriv($.angle);
+    $.k = ceil(log2(abs($.deriv_exact)));
+    if ($.deriv_exact < 0) then $.k = -$.k;
+    // NOTE(review): the line above negates the exponent k, so sigma stays positive (2^-k) for a negative derivative; confirm this is intended rather than sigma = -2^k, and check the log2(abs(0)) case where the derivative vanishes.
+    $.sigma = 2.0^$.k;
+    $.deriv = pT.kRound($.deriv_exact - $.sigma);
+    r = r @ [|$.deriv, $.sigma, $.high, $.low|];
+  };
+  return ToStringCArray(r, pT.kCSFX, 4);
+};
+
+Append(  // sin tables: function sin(x), derivative cos(x)
+  "template <typename T> constexpr char kSinApproxTable[] = {};",
+  "template <> constexpr float kSinApproxTable<float>[] = ",
+  ApproxLut4_(Float32, sin(x), cos(x)),
+  "",
+  "template <> constexpr double kSinApproxTable<double>[] = ",
+  ApproxLut4_(Float64, sin(x), cos(x)),
+  ""
+);
+Append(  // cos tables: function cos(x), derivative -sin(x)
+  "template <typename T> constexpr char kCosApproxTable[] = {};",
+  "template <> constexpr float kCosApproxTable<float>[] = ",
+  ApproxLut4_(Float32, cos(x), -sin(x)),
+  "",
+  "template <> constexpr double kCosApproxTable<double>[] = ",
+  ApproxLut4_(Float64, cos(x), -sin(x)),
+  ""
+);
+
+WriteCPPHeader("npsr::trig::data");  // presumably writes the appended lines as a header inside this namespace - TODO confirm
@@ -0,0 +1,57 @@
+// Auto-generated by npsr/trig/data/constants.h.sol
+// Use `spin generate -f` to force regeneration
+#ifndef NPSR_TRIG_DATA_CONSTANTS_H
+#define NPSR_TRIG_DATA_CONSTANTS_H
+
+namespace npsr::trig::data {  // generated constant tables; regenerate from constants.h.sol rather than editing
+template <typename T, bool FMA> constexpr char kPi[] = {};  // placeholder primary; NOTE(review): an empty char[] is a zero-length array (compiler extension) - confirm portability
+template <> constexpr float kPi<float, true>[] = {
+  0x1.921fb6p1f, -0x1.777a5cp-24f, -0x1.ee59dap-49f, 
+};
+template <> constexpr float kPi<float, false>[] = {
+  0x1.92p1f, 0x1.fb4p-11f, 0x1.444p-23f, 0x1.68c234p-38f, 
+};
+template <> constexpr double kPi<double, true>[] = {
+  0x1.921fb54442d18p1, 0x1.1a62633145c06p-53, 0x1.c1cd129024e09p-106, 
+};
+template <> constexpr double kPi<double, false>[] = {
+  0x1.921fb6p1, -0x1.777a5cp-24, -0x1.ee59dap-49, 0x1.98a2e03707345p-76, 
+};
+// kPiPrec35: the primary template carries the FMA == true data; <false> is specialized below.
+template <bool FMA> constexpr double kPiPrec35[] = {
+  0x1.921fb5444p1, 0x1.68c234c4c6628p-38, 
+};
+template <> constexpr double kPiPrec35<false>[] = {
+  0x1.921fb6p1, -0x1.777a5cp-24, -0x1.ee59dap-49, 
+};
+// kPiMul2: 2*pi as a two-part split for each floating-point type.
+template <typename T> constexpr char kPiMul2[] = {};
+template <> constexpr float kPiMul2<float>[] = {
+  0x1.921fb6p2f, -0x1.777a5cp-23f, 
+};
+template <> constexpr double kPiMul2<double>[] = {
+  0x1.921fb54442d18p2, 0x1.1a62633145c07p-52, 
+};
+// kPiDiv16Prec29: pi/16 splits; the <false> variant is not in magnitude order (its p-33 part is stored last).
+template <bool FMA> constexpr double kPiDiv16Prec29[] = {
+  0x1.921fb54442d18p-3, 0x1.1a62633p-57, 0x1.45c06e0e68948p-89, 
+};
+template <> constexpr double kPiDiv16Prec29<false>[] = {
+  0x1.921fb54p-3, 0x1.a626331p-61, 0x1.1701b839a252p-91, 0x1.10b461p-33, 
+};
+// kInvPi: 1/pi rounded to each type; the char primary is only a placeholder.
+template <typename T> constexpr char kInvPi = '_';
+template <> constexpr float kInvPi<float> = 0x1.45f306p-2f;
+template <> constexpr double kInvPi<double> = 0x1.45f306dc9c883p-2;
+// kHalfPi: pi/2 rounded to each type.
+template <typename T> constexpr char kHalfPi = '_';
+template <> constexpr float kHalfPi<float> = 0x1.921fb6p0f;
+template <> constexpr double kHalfPi<double> = 0x1.921fb54442d18p0;
+// k16DivPi: 16/pi rounded to each type.
+template <typename T> constexpr char k16DivPi = '_';
+template <> constexpr float k16DivPi<float> = 0x1.45f306p2f;
+template <> constexpr double k16DivPi<double> = 0x1.45f306dc9c883p2;
+
+} // namespace npsr::trig::data
+
+#endif // NPSR_TRIG_DATA_CONSTANTS_H
@@ -0,0 +1,87 @@
+procedure ConstantsToArrayF32_(pArgs = ...) {  // variadic: all arguments are handed to ConstantsFromArray as pArgs
+  return ToStringCArray(ConstantsFromArray(pArgs), "f", 4);  // "f" = float literal suffix; 4 is presumably values per row - TODO confirm
+};
+procedure ConstantsToArrayF64_(pArgs = ...) {  // variadic: all arguments are handed to ConstantsFromArray as pArgs
+  return ToStringCArray(ConstantsFromArray(pArgs), "", 4);  // "" = no literal suffix (double); 4 is presumably values per row - TODO confirm
+};
+
+Append(  // kPi: pi as a multi-part split; the generated header holds one value per number listed after each rounding mode
+  "template <typename T, bool FMA> constexpr char kPi[] = {};",
+
+  "template <> constexpr float kPi<float, true>[] = " @
+  ConstantsToArrayF32_(pi, [|RN, 24, 24, 24|]),
+  "template <> constexpr float kPi<float, false>[] = " @
+  ConstantsToArrayF32_(pi, [|RD, 11, 11, 11|], [|RN, 24|]), // no FMA
+  
+  
+  "template <> constexpr double kPi<double, true>[] = " @
+  ConstantsToArrayF64_(pi, [|RN, 53|], [|RD, 53|], [|RU, 53|]),
+  "template <> constexpr double kPi<double, false>[] = " @
+  ConstantsToArrayF64_(pi, [|RN, 24, 24, 24|], [|RN, 53|]), // no FMA
+  
+  ""
+);
+// kPiPrec35: the primary (FMA) variant leads with a 35-bit part; the <false> variant uses three 24-bit parts.
+Append(
+  "template <bool FMA> constexpr double kPiPrec35[] = " @
+  ConstantsToArrayF64_(pi, [|RN, 35|], [|RD, 53|]),
+  "template <> constexpr double kPiPrec35<false>[] = " @
+  ConstantsToArrayF64_(pi, [|RN, 24, 24, 24|]),
+  ""
+);
+// kPiMul2: two-part split of 2*pi for each type.
+Append(
+  "template <typename T> constexpr char kPiMul2[] = {};",
+  
+  "template <> constexpr float kPiMul2<float>[] = " @
+  ConstantsToArrayF32_(pi*2, [|RN, 24, 24|]),
+  "template <> constexpr double kPiMul2<double>[] = " @
+  ConstantsToArrayF64_(pi*2, [|RN, 53, 53|]),
+  "" 
+);
+// pi/16 without FMA: the parts are computed into vNFma and emitted in the order [0], [2], [3], [1].
+vNFma = Constants(pi/16, [|RN, 27, 27|], [|RN, 29|], [|RN, 53|]);
+Append(
+  "template <bool FMA> constexpr double kPiDiv16Prec29[] = " @
+  ConstantsToArrayF64_(pi/16, [|RN, 53|], [|RN, 29|], [|RN, 53|]),
+  "template <> constexpr double kPiDiv16Prec29<false>[] = " @
+  ToStringCArray([|vNFma[0], vNFma[2], vNFma[3], vNFma[1]|], "", 4),
+  ""
+);
+// The scalar constants below are rounded once with single() / double().
+Append(
+  "template <typename T> constexpr char kInvPi = '_';",
+  "template <> constexpr float kInvPi<float> = " @
+  single(1/pi) @ "f;",
+  
+  "template <> constexpr double kInvPi<double> = " @
+  double(1/pi) @ ";",
+  ""
+); 
+
+Append(
+  "template <typename T> constexpr char kHalfPi = '_';",
+  
+  "template <> constexpr float kHalfPi<float> = " @
+  single(pi/2) @ "f;",
+  
+  "template <> constexpr double kHalfPi<double> = " @
+  double(pi/2) @ ";",
+  "" 
+);
+
+Append(
+  "template <typename T> constexpr char k16DivPi = '_';",
+  
+  "template <> constexpr float k16DivPi<float> = " @
+  single(16/pi) @ "f;",
+
+  "template <> constexpr double k16DivPi<double> = " @
+  double(16/pi) @ ";",
+  ""
+); 
+
+// Dump();
+
+WriteCPPHeader("npsr::trig::data");  // the generated constants.h wraps the appended lines in this namespace
+
@@ -0,0 +1,11 @@
+// Auto-generated by npsr/trig/data/data.h.sol
+// Use `spin generate -f` to force regeneration
+#ifndef NPSR_TRIG_DATA_DATA_H
+#define NPSR_TRIG_DATA_DATA_H
+
+#include "npsr/trig/data/constants.h"
+#include "npsr/trig/data/high.h"
+#include "npsr/trig/data/approx.h"
+#include "npsr/trig/data/reduction.h"
+
+#endif // NPSR_TRIG_DATA_DATA_H
@@ -0,0 +1,10 @@
+var header;
+for header in [|"constants", "high", "approx", "reduction"|] do {  // one #include line per data header, in this order
+  Append(
+    "#include \"npsr/trig/data/" @ header @ ".h\""
+  );
+};
+
+WriteCPPHeader();  // no namespace argument: the generated data.h holds only the include guard and the #include lines
+
+
@@ -0,0 +1,54 @@
+// Auto-generated by npsr/trig/data/high.h.sol
+// Use `spin generate -f` to force regeneration
+#ifndef NPSR_TRIG_DATA_HIGH_H
+#define NPSR_TRIG_DATA_HIGH_H
+
+namespace npsr::trig::data {  // generated tables; regenerate from high.h.sol rather than editing
+constexpr double kHiSinKPi16Table[] = {  // 16 entries: sin(k*pi/16) for k = 0..15 as doubles
+  0,                    
+  0x1.8f8b83c69a60bp-3, 
+  0x1.87de2a6aea963p-2, 
+  0x1.1c73b39ae68c8p-1, 
+  0x1.6a09e667f3bcdp-1, 
+  0x1.a9b66290ea1a3p-1, 
+  0x1.d906bcf328d46p-1, 
+  0x1.f6297cff75cbp-1,  
+  0x1p0,                
+  0x1.f6297cff75cbp-1,  
+  0x1.d906bcf328d46p-1, 
+  0x1.a9b66290ea1a3p-1, 
+  0x1.6a09e667f3bcdp-1, 
+  0x1.1c73b39ae68c8p-1, 
+  0x1.87de2a6aea963p-2, 
+  0x1.8f8b83c69a60bp-3, 
+};
+
+constexpr double kHiCosKPi16Table[] = {  // 16 entries: cos(k*pi/16) for k = 0..15 as doubles
+  0x1p0,                 
+  0x1.f6297cff75cbp-1,   
+  0x1.d906bcf328d46p-1,  
+  0x1.a9b66290ea1a3p-1,  
+  0x1.6a09e667f3bcdp-1,  
+  0x1.1c73b39ae68c8p-1,  
+  0x1.87de2a6aea963p-2,  
+  0x1.8f8b83c69a60bp-3,  
+  0,                     
+  -0x1.8f8b83c69a60bp-3, 
+  -0x1.87de2a6aea963p-2, 
+  -0x1.1c73b39ae68c8p-1, 
+  -0x1.6a09e667f3bcdp-1, 
+  -0x1.a9b66290ea1a3p-1, 
+  -0x1.d906bcf328d46p-1, 
+  -0x1.f6297cff75cbp-1,  
+};
+// 16 small-magnitude entries; presumably the low-order parts matching the two tables above, in a packed layout defined by high.h.sol - TODO confirm.
+constexpr double kPackedLowSinCosKPi16Table[] = {
+  0,                      0x1.56217bc626d19p-56,  0x1.457e6bc672cedp-56,  0x1.9f6303c8b25ddp-60,  
+  -0x1.bdd34bc8bdd34p-55, 0x1.b25dd3c39f63p-55,   -0x1.72ced3c7457e6p-57, -0x1.26d193c756217p-57, 
+  0,                      0x1.26d193c756217p-57,  0x1.72ced3c7457e6p-57,  -0x1.b25dd3c39f63p-55,  
+  0x1.bdd34bc8bdd34p-55,  -0x1.9f6303c8b25ddp-60, -0x1.457e6bc672cedp-56, -0x1.56217bc626d19p-56, 
+};
+
+} // namespace npsr::trig::data
+
+#endif // NPSR_TRIG_DATA_HIGH_H