From b99d9dc88faa8b031b74f1db4e3d675de8696f18 Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 27 Sep 2022 00:29:20 -0400 Subject: [PATCH 1/3] test_lapack: directly make test matrices / pivot vectors --- tests/test_lapack.cxx | 199 +++++++++++------------------------------- 1 file changed, 51 insertions(+), 148 deletions(-) diff --git a/tests/test_lapack.cxx b/tests/test_lapack.cxx index 67fa4609..0f68023b 100644 --- a/tests/test_lapack.cxx +++ b/tests/test_lapack.cxx @@ -6,144 +6,44 @@ #include "test_helpers.h" -template -void set_A0(T&& h_A) -{ - // matlab/octave: - // A = [1 2 2; 4 4 2; 4 6 4]; - // L,U,p = lu(A) - // first column - h_A(0, 0) = 1; - h_A(1, 0) = 4; - h_A(2, 0) = 4; - // second column - h_A(0, 1) = 2; - h_A(1, 1) = 4; - h_A(2, 1) = 6; - // third column - h_A(0, 2) = 2; - h_A(1, 2) = 2; - h_A(2, 2) = 4; -} +using namespace std::complex_literals; -template -void set_A0_nopiv(T&& h_A) -{ - // matlab/octave: - // does not require pivoting even if pivoting is possible - // A = [1 0 0; 1 2 0; 0 2 3]; - // L,U,p = lu(A) - // first column - h_A(0, 0) = 1.; - h_A(1, 0) = 0.; - h_A(2, 0) = 0.; - // second column - h_A(0, 1) = 1.; - h_A(1, 1) = 2.; - h_A(2, 1) = 0.; - // third column - h_A(0, 2) = 0.; - h_A(1, 2) = 2.; - h_A(2, 2) = 3.; -} +// matlab/octave: +// L,U,p = lu(A) -template -void set_A0_LU(T&& h_A) -{ - // first column factored - h_A(0, 0) = 4.0; - h_A(1, 0) = 1.0; - h_A(2, 0) = 0.25; - // second column - h_A(0, 1) = 4.0; - h_A(1, 1) = 2.0; - h_A(2, 1) = 0.5; - // thrid column - h_A(0, 2) = 2.0; - h_A(1, 2) = 2.0; - h_A(2, 2) = 0.5; -} +// clang-format off +// A0 = [1 2 2; 4 4 2; 4 6 4]; +gt::gtensor A0{{1, 4, 4}, + {2, 4, 6}, + {2, 2, 4}}; -template -void set_A0_piv(T&& h_p) -{ - h_p(0) = 2; - h_p(1) = 3; - h_p(2) = 3; -} +gt::gtensor A0_LU{{4.0, 1.0, 0.25}, + {4.0, 2.0, 0.5 }, + {2.0, 2.0, 0.5 }}; -template -void set_A1_complex(C&& h_A1) -{ - using T = typename C::value_type; - - // second matrix, complex - // matlab/octave: - // B = 
[1+i 2-i 2; 4i 4 2; 4 6i 4]; - // L,U,p = lu(A2); - // first column - h_A1(0, 0) = T(1, 1); - h_A1(1, 0) = T(0, 4); - h_A1(2, 0) = T(4, 0); - // second column - h_A1(0, 1) = T(2, -1); - h_A1(1, 1) = T(4, 0); - h_A1(2, 1) = T(0, 6); - // third column - h_A1(0, 2) = T(2, 0); - h_A1(1, 2) = T(2, 0); - h_A1(2, 2) = T(4, 0); -} +gt::gtensor A0_piv{2, 3, 3}; -template -void set_A1_complex_nopiv(C&& h_A1) -{ - using T = typename C::value_type; - - // second matrix, complex - // matlab/octave: - // B = [1+i 2-i 2; 4i 4 2; 4 6i 4]; - // L,U,p = lu(A2); - // first column - h_A1(0, 0) = T(1, 0); - h_A1(1, 0) = T(0, 0); - h_A1(2, 0) = T(0, 0); - // second column - h_A1(0, 1) = T(1, 0); - h_A1(1, 1) = T(2, 0); - h_A1(2, 1) = T(0, 0); - // third column - h_A1(0, 2) = T(0, 0); - h_A1(1, 2) = T(2, 0); - h_A1(2, 2) = T(3, 0); -} +// A0_nopiv = [1 0 0; 1 2 0; 0 2 3]; +gt::gtensor A0_nopiv{{1, 0, 0}, + {1, 2, 0}, + {0, 2, 3}}; -template -void set_A1_LU_complex(C&& h_A1) -{ - using T = typename C::value_type; +// A1 = [1+i 2-i 2; 4i 4 2; 4 6i 4]; +gt::gtensor, 2> A1{{1. + 1.i, 4.i, 4. }, + {2. - 1.i, 4., 6.i}, + {2. , 2., 4. }}; +// A1_LU +gt::gtensor, 2> A1_LU{{4.i, - 1.i, 0.25 - 0.25i}, + {4. , 10.i, - 0.10i}, + {2. , 4. 
+ 2.i, 1.30 + 0.90i}}; - // first column - h_A1(0, 0) = T(0, 4); - h_A1(1, 0) = T(0, -1); - h_A1(2, 0) = T(0.25, -0.25); - // second column factored - h_A1(0, 1) = T(4, 0); - h_A1(1, 1) = T(0, 10); - h_A1(2, 1) = T(0, -0.1); - // third column factored - h_A1(0, 2) = T(2, 0); - h_A1(1, 2) = T(4, 2); - h_A1(2, 2) = T(1.3, 0.9); -} +gt::gtensor A1_piv{2, 3, 3}; -template -void set_A1_piv(T&& h_p) -{ - h_p(0) = 2; - h_p(1) = 3; - h_p(2) = 3; -} +// A1_nopiv +gt::gtensor, 2> A1_nopiv{{1., 0., 0.}, + {1., 2., 0.}, + {0., 2., 3.}}; +// clang-format on template void test_getrf_batch_real() @@ -162,7 +62,7 @@ void test_getrf_batch_real() auto h_A0 = h_A.view(gt::all, gt::all, 0); - set_A0(h_A0); + h_A0 = A0; h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); gt::copy(h_A, d_A); @@ -235,9 +135,8 @@ void test_getrs_batch_real() gt::gtensor_device d_p(gt::shape(N, batch_size)); // set up first (and only) batch - set_A0_LU(h_A.view(gt::all, gt::all, 0)); - h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); - set_A0_piv(h_p.view(gt::all, 0)); + h_A.view(gt::all, gt::all, 0) = A0_LU; + h_p.view(gt::all, 0) = A0_piv; // set up two col vectors to solve in first (and only) batch // first RHS, col vector [11; 18; 28] @@ -248,6 +147,8 @@ void test_getrs_batch_real() h_B(0, 1, 0) = 73; h_B(1, 1, 0) = 78; h_B(2, 1, 0) = 154; + + h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); h_Bptr(0) = gt::raw_pointer_cast(d_B.data()); gt::copy(h_Aptr, d_Aptr); @@ -301,13 +202,13 @@ void test_getrf_batch_complex() gt::gtensor_device d_info(batch_size); // setup first batch matrix input - set_A0(h_A.view(gt::all, gt::all, 0)); - h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); + h_A.view(gt::all, gt::all, 0) = A0; // setup second batch matrix input - set_A1_complex(h_A.view(gt::all, gt::all, 1)); + h_A.view(gt::all, gt::all, 1) = A1; // TODO: better notation for this, i.e. the ability to get a pointer from a // view if it is wrapping a gcontainer or gtensor_span? 
+ h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); h_Aptr(1) = h_Aptr(0) + N * N; gt::copy(h_A, d_A); @@ -396,7 +297,7 @@ void test_getrf_npvt_batch_complex() gt::gtensor_device d_info(batch_size); // setup first batch matrix input - set_A0_nopiv(h_A.view(gt::all, gt::all, 0)); + h_A.view(gt::all, gt::all, 0) = A0_nopiv; h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); // setup second batch matrix input @@ -502,14 +403,15 @@ void test_getrs_batch_complex() test::gtensor2 d_p(gt::shape(N, batch_size)); // setup input for first batch - set_A0_LU(h_A.view(gt::all, gt::all, 0)); - h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); - set_A0_piv(h_p.view(gt::all, 0)); + h_A.view(gt::all, gt::all, 0) = A0_LU; + h_p.view(gt::all, 0) = A0_piv; // setup input for second batch - set_A1_LU_complex(h_A.view(gt::all, gt::all, 1)); + h_A.view(gt::all, gt::all, 1) = A1_LU; + h_p.view(gt::all, 1) = A1_piv; + + h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); h_Aptr[1] = h_Aptr(0) + N * N; - set_A1_piv(h_p.view(gt::all, 1)); // first batch, first rhs col vector (11; 18; 28) h_B(0, 0, 0) = 11; @@ -608,14 +510,15 @@ void test_getri_batch_complex() gt::gtensor_device d_info(batch_size); // setup input for first batch - set_A0_LU(h_A.view(gt::all, gt::all, 0)); - h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); - set_A0_piv(h_p.view(gt::all, 0)); + h_A.view(gt::all, gt::all, 0) = A0_LU; + h_p.view(gt::all, 0) = A0_piv; // setup input for second batch - set_A1_LU_complex(h_A.view(gt::all, gt::all, 1)); + h_A.view(gt::all, gt::all, 1) = A1_LU; + h_p.view(gt::all, 1) = A1_piv; + + h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); h_Aptr[1] = h_Aptr(0) + N * N; - set_A1_piv(h_p.view(gt::all, 1)); h_Cptr(0) = gt::raw_pointer_cast(d_C.data()); h_Cptr(1) = h_Cptr(0) + N * N; From 0f4e906e9b18554fc86035c631f0d65d51f21a14 Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 27 Sep 2022 13:32:24 -0400 Subject: [PATCH 2/3] tests/lapack: make tests more concise They are still kinda lengthy compared to what they 
do -- we should probably have a better way to deal with the host/device mirroring. And generally, there's still more room for improvement, I don't think I made all the best choices here... --- tests/test_lapack.cxx | 175 ++++++++++-------------------------- 1 file changed, 42 insertions(+), 133 deletions(-) diff --git a/tests/test_lapack.cxx b/tests/test_lapack.cxx index 0f68023b..1c56ef2d 100644 --- a/tests/test_lapack.cxx +++ b/tests/test_lapack.cxx @@ -21,7 +21,10 @@ gt::gtensor A0_LU{{4.0, 1.0, 0.25}, {4.0, 2.0, 0.5 }, {2.0, 2.0, 0.5 }}; -gt::gtensor A0_piv{2, 3, 3}; +// Math notation is 2, 3, 1, but BLAS uses procedural notation, i.e. +// on the first step no swap is done so one-based index of third row +// is still 3 (no swapping). +gt::gtensor A0_piv{2, 3, 3}; // A0_nopiv = [1 0 0; 1 2 0; 0 2 3]; gt::gtensor A0_nopiv{{1, 0, 0}, {1, 2, 0}, {0, 2, 3}}; @@ -37,7 +40,7 @@ gt::gtensor, 2> A1_LU{{4.i, - 1.i, 0.25 - 0.25i}, {4. , 10.i, - 0.10i}, {2. , 4. + 2.i, 1.30 + 0.90i}}; -gt::gtensor A1_piv{2, 3, 3}; +gt::gtensor A1_piv{2, 3, 3}; // A1_nopiv gt::gtensor, 2> A1_nopiv{{1., 0., 0.}, {1., 2., 0.}, {0., 2., 3.}}; @@ -60,9 +63,7 @@ void test_getrf_batch_real() gt::gtensor h_info(batch_size); gt::gtensor_device d_info(batch_size); - auto h_A0 = h_A.view(gt::all, gt::all, 0); - - h_A0 = A0; + h_A.view(gt::all, gt::all, 0) = A0; h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); gt::copy(h_A, d_A); @@ -78,30 +79,11 @@ void test_getrf_batch_real() gt::copy(d_p, h_p); gt::copy(d_info, h_info); - // first column factored - EXPECT_EQ(h_A0(0, 0), 4.0); - EXPECT_EQ(h_A0(1, 0), 1.0); - EXPECT_EQ(h_A0(2, 0), 0.25); - // second column factored - EXPECT_EQ(h_A0(0, 1), 4.0); - EXPECT_EQ(h_A0(1, 1), 2.0); - EXPECT_EQ(h_A0(2, 1), 0.5); - // third column factored - EXPECT_EQ(h_A0(0, 2), 2.0); - EXPECT_EQ(h_A0(1, 2), 2.0); - EXPECT_EQ(h_A0(2, 2), 0.5); - - // Math notation is 2, 3, 1, but BLAS uses procedural notation, i.e. - // on the thirst step no swap is done so one-based index of third row - // is still 3 (no swapping). 
- EXPECT_EQ(h_p(0, 0), 2); - EXPECT_EQ(h_p(1, 0), 3); - EXPECT_EQ(h_p(2, 0), 3); + auto h_A0 = h_A.view(gt::all, gt::all, 0); + EXPECT_NEAR(gt::sum_squares(h_A0 - A0_LU), 0., 1e-14); + EXPECT_EQ(h_p.view(gt::all, 0), A0_piv); - for (int b = 0; b < batch_size; b++) { - // A_i factored successfully - EXPECT_EQ(h_info(b), 0); - } + EXPECT_EQ(h_info, gt::zeros({batch_size})); } TEST(lapack, sgetrf_batch) @@ -140,13 +122,9 @@ void test_getrs_batch_real() // set up two col vectors to solve in first (and only) batch // first RHS, col vector [11; 18; 28] - h_B(0, 0, 0) = 11; - h_B(1, 0, 0) = 18; - h_B(2, 0, 0) = 28; + h_B.view(gt::all, 0, 0) = gt::gtensor{11., 18., 28.}; // second RHS, col vector [73; 78; 154] - h_B(0, 1, 0) = 73; - h_B(1, 1, 0) = 78; - h_B(2, 1, 0) = 154; + h_B.view(gt::all, 1, 0) = gt::gtensor{73., 78., 154.}; h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); h_Bptr(0) = gt::raw_pointer_cast(d_B.data()); @@ -166,13 +144,9 @@ void test_getrs_batch_real() gt::copy(d_B, h_B); // solution vector [1; 2; 3] - EXPECT_EQ(h_B(0, 0, 0), 1.0); - EXPECT_EQ(h_B(1, 0, 0), 2.0); - EXPECT_EQ(h_B(2, 0, 0), 3.0); + EXPECT_EQ(h_B.view(gt::all, 0, 0), (gt::gtensor{1., 2., 3.})); // solution vector [-3; 7; 31] - EXPECT_EQ(h_B(0, 1, 0), -3.0); - EXPECT_EQ(h_B(1, 1, 0), 7.0); - EXPECT_EQ(h_B(2, 1, 0), 31.0); + EXPECT_EQ(h_B.view(gt::all, 1, 0), (gt::gtensor{-3., 7., 31.})); } TEST(lapack, sgetrs_batch) @@ -225,51 +199,16 @@ void test_getrf_batch_complex() gt::copy(d_info, h_info); // first batch matrix result - // first column factored - expect_complex_near(h_A(0, 0, 0), 4.0); - expect_complex_near(h_A(1, 0, 0), 1.0); - expect_complex_near(h_A(2, 0, 0), 0.25); - // second column factored - expect_complex_near(h_A(0, 1, 0), 4.0); - expect_complex_near(h_A(1, 1, 0), 2.0); - expect_complex_near(h_A(2, 1, 0), 0.5); - // third column factored - expect_complex_near(h_A(0, 2, 0), 2.0); - expect_complex_near(h_A(1, 2, 0), 2.0); - expect_complex_near(h_A(2, 2, 0), 0.5); - - // Math 
notation is 2, 3, 1, but BLAS uses procedural notation, i.e. - // on the thirst step no swap is done so one-based index of third row - // is still 3 (no swapping). - EXPECT_EQ(h_p(0, 0), 2); - EXPECT_EQ(h_p(1, 0), 3); - EXPECT_EQ(h_p(2, 0), 3); + gt::gtensor h_A0 = h_A.view(gt::all, gt::all, 0); + EXPECT_NEAR(gt::sum_squares(h_A0 - gt::gtensor(A0_LU)), 0., 1e-14); + EXPECT_EQ(h_p.view(gt::all, 0), A0_piv); // second batch matrix result - // first column factored - expect_complex_near(h_A(0, 0, 1), T(0, 4)); - expect_complex_near(h_A(1, 0, 1), T(0, -1)); - expect_complex_near(h_A(2, 0, 1), T(0.25, -0.25)); - // second column factored - expect_complex_near(h_A(0, 1, 1), T(4, 0)); - expect_complex_near(h_A(1, 1, 1), T(0, 10)); - expect_complex_near(h_A(2, 1, 1), T(0, -0.1)); - // third column factored - expect_complex_near(h_A(0, 2, 1), T(2, 0)); - expect_complex_near(h_A(1, 2, 1), T(4, 2)); - expect_complex_near(h_A(2, 2, 1), T(1.3, 0.9)); + gt::gtensor h_A1 = h_A.view(gt::all, gt::all, 1); + EXPECT_NEAR(gt::sum_squares(h_A1 - gt::gtensor(A1_LU)), 0., 1e-14); + EXPECT_EQ(h_p.view(gt::all, 1), A1_piv); - // Math notation is 2, 3, 1, but BLAS uses procedural notation, i.e. - // on the thirst step no swap is done so one-based index of third row - // is still 3 (no swapping). - EXPECT_EQ(h_p(0, 1), 2); - EXPECT_EQ(h_p(1, 1), 3); - EXPECT_EQ(h_p(2, 1), 3); - - for (int b = 0; b < batch_size; b++) { - // A_i factored successfully - EXPECT_EQ(h_info(b), 0); - } + EXPECT_EQ(h_info, gt::zeros({batch_size})); } TEST(lapack, cgetrf_batch) @@ -298,12 +237,12 @@ void test_getrf_npvt_batch_complex() // setup first batch matrix input h_A.view(gt::all, gt::all, 0) = A0_nopiv; - h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); - // setup second batch matrix input - set_A1_complex_nopiv(h_A.view(gt::all, gt::all, 1)); + h_A.view(gt::all, gt::all, 1) = A1_nopiv; + // TODO: better notation for this, i.e. 
the ability to get a pointer from a // view if it is wrapping a gcontainer or gtensor_span? + h_Aptr(0) = gt::raw_pointer_cast(d_A.data()); h_Aptr(1) = h_Aptr(0) + N * N; gt::copy(h_A, d_A); @@ -414,21 +353,15 @@ void test_getrs_batch_complex() h_Aptr[1] = h_Aptr(0) + N * N; // first batch, first rhs col vector (11; 18; 28) - h_B(0, 0, 0) = 11; - h_B(1, 0, 0) = 18; - h_B(2, 0, 0) = 28; + h_B.view(gt::all, 0, 0) = gt::gtensor{11., 18., 28.}; // first batch, second rhs col vector (73; 78; 154) - h_B(0, 1, 0) = 73; - h_B(1, 1, 0) = 78; - h_B(2, 1, 0) = 154; + h_B.view(gt::all, 1, 0) = gt::gtensor{73., 78., 154.}; // second batch, first rhs col vector (73; 78; 154) - h_B(0, 0, 1) = T(11, -1); - h_B(1, 0, 1) = T(14, 4); - h_B(2, 0, 1) = T(16, 12); + h_B.view(gt::all, 0, 1) = + gt::gtensor{T(11., -1.), T(14., 4.), T(16., 12.)}; // second batch, second rhs col vector (73-10i; 90-12i; 112 + 42i) - h_B(0, 1, 1) = T(73, -10); - h_B(1, 1, 1) = T(90, -12); - h_B(2, 1, 1) = T(112, 42); + h_B.view(gt::all, 1, 1) = + gt::gtensor{T(73., -10.), T(90., -12.), T(112., 42.)}; h_Bptr(0) = gt::raw_pointer_cast(d_B.data()); h_Bptr(1) = h_Bptr(0) + N * NRHS; @@ -449,21 +382,12 @@ void test_getrs_batch_complex() gt::copy(d_B, h_B); // first batch, first solution vector [1; 2; 3] - expect_complex_near(h_B(0, 0, 0), 1.0); - expect_complex_near(h_B(1, 0, 0), 2.0); - expect_complex_near(h_B(2, 0, 0), 3.0); // first batch, second solution vector [-3; 7; 31] - expect_complex_near(h_B(0, 1, 0), -3.0); - expect_complex_near(h_B(1, 1, 0), 7.0); - expect_complex_near(h_B(2, 1, 0), 31.0); // second batch, first solution vector [1; 2; 3] - expect_complex_near(h_B(0, 0, 1), 1.0); - expect_complex_near(h_B(1, 0, 1), 2.0); - expect_complex_near(h_B(2, 0, 1), 3.0); // second batch, second solution vector [-3; 7; 31] - expect_complex_near(h_B(0, 1, 1), -3.0); - expect_complex_near(h_B(1, 1, 1), 7.0); - expect_complex_near(h_B(2, 1, 1), 31.0); + auto expected_B = gt::gtensor( + {{{1., 2., 3.}, {-3., 7., 
31.}}, {{1., 2., 3.}, {-3., 7., 31.}}}); + EXPECT_NEAR(gt::sum_squares(h_B - expected_B), 0., 1e-10); } TEST(lapack, cgetrs_batch) @@ -539,30 +463,15 @@ void test_getri_batch_complex() gt::copy(d_C, h_C); - // first batch, first solution col [1; -2; 2] - expect_complex_near(h_C(0, 0, 0), 1.0); - expect_complex_near(h_C(1, 0, 0), -2.0); - expect_complex_near(h_C(2, 0, 0), 2.0); - // first batch, second solution col [1; -1; 0.5] - expect_complex_near(h_C(0, 1, 0), 1.0); - expect_complex_near(h_C(1, 1, 0), -1.0); - expect_complex_near(h_C(2, 1, 0), 0.5); - // first batch, third solution col [-1; 1.5; -1] - expect_complex_near(h_C(0, 2, 0), -1.0); - expect_complex_near(h_C(1, 2, 0), 1.5); - expect_complex_near(h_C(2, 2, 0), -1.0); - // second batch, first solution col - expect_complex_near(h_C(0, 0, 1), T(-0.1, 0.3)); - expect_complex_near(h_C(1, 0, 1), T(0.04, 0.28)); - expect_complex_near(h_C(2, 0, 1), T(0.52, -0.36)); - // second batch, second solution col - expect_complex_near(h_C(0, 1, 1), T(-0.04, -0.28)); - expect_complex_near(h_C(1, 1, 1), T(0.016, -0.088)); - expect_complex_near(h_C(2, 1, 1), T(-0.092, 0.256)); - // second batch, third solution col - expect_complex_near(h_C(0, 2, 1), T(0.07, -0.01)); - expect_complex_near(h_C(1, 2, 1), T(-0.028, -0.096)); - expect_complex_near(h_C(2, 2, 1), T(0.036, 0.052)); + // clang-format off + auto expected_C = + gt::gtensor({{{ 1.0, -2.0, 2.0}, + { 1.0, -1.0, .5}, + {-1.0, 1.5, -1.0}}, + {{T(-0.1 , 0.3 ), T( 0.04 , 0.28 ), T( 0.52 , -0.36 )}, + {T(-0.04, -0.28), T( 0.016, -0.088), T(-0.092, 0.256)}, + {T( 0.07, -0.01), T(-0.028, -0.096), T( 0.036, 0.052)}}}); + // clang-format on } TEST(lapack, cgetri_batch) From 1bf04a349b7c62288e2891caa1fde34d52a9798c Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Tue, 27 Sep 2022 22:25:50 -0400 Subject: [PATCH 3/3] test/lapack: use gt::launch for checking against expected --- tests/test_lapack.cxx | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) 
diff --git a/tests/test_lapack.cxx b/tests/test_lapack.cxx index 1c56ef2d..6e9d8625 100644 --- a/tests/test_lapack.cxx +++ b/tests/test_lapack.cxx @@ -79,8 +79,11 @@ void test_getrf_batch_real() gt::copy(d_p, h_p); gt::copy(d_info, h_info); - auto h_A0 = h_A.view(gt::all, gt::all, 0); - EXPECT_NEAR(gt::sum_squares(h_A0 - A0_LU), 0., 1e-14); + gt::launch_host<2>( + A0_LU.shape(), GT_LAMBDA(int i, int j) { + EXPECT_NEAR(h_A(i, j, 0), A0_LU(i, j), 1e-14) + << "i = " << i << " j = " << j; + }); EXPECT_EQ(h_p.view(gt::all, 0), A0_piv); EXPECT_EQ(h_info, gt::zeros({batch_size})); @@ -199,13 +202,17 @@ void test_getrf_batch_complex() gt::copy(d_info, h_info); // first batch matrix result - gt::gtensor h_A0 = h_A.view(gt::all, gt::all, 0); - EXPECT_NEAR(gt::sum_squares(h_A0 - gt::gtensor(A0_LU)), 0., 1e-14); + gt::launch_host<2>( + A0_LU.shape(), GT_LAMBDA(int i, int j) { + expect_complex_near(h_A(i, j, 0), T(A0_LU(i, j)), 1e-6); + }); EXPECT_EQ(h_p.view(gt::all, 0), A0_piv); // second batch matrix result - gt::gtensor h_A1 = h_A.view(gt::all, gt::all, 1); - EXPECT_NEAR(gt::sum_squares(h_A1 - gt::gtensor(A1_LU)), 0., 1e-14); + gt::launch_host<2>( + A0_LU.shape(), GT_LAMBDA(int i, int j) { + expect_complex_near(h_A(i, j, 1), T(A1_LU(i, j)), 1e-6); + }); EXPECT_EQ(h_p.view(gt::all, 1), A1_piv); EXPECT_EQ(h_info, gt::zeros({batch_size})); @@ -387,7 +394,10 @@ void test_getrs_batch_complex() // second batch, second solution vector [-3; 7; 31] auto expected_B = gt::gtensor( {{{1., 2., 3.}, {-3., 7., 31.}}, {{1., 2., 3.}, {-3., 7., 31.}}}); - EXPECT_NEAR(gt::sum_squares(h_B - expected_B), 0., 1e-10); + gt::launch_host<3>( + h_B.shape(), GT_LAMBDA(int i, int j, int k) { + expect_complex_near(h_B(i, j, k), expected_B(i, j, k), 1e-6); + }); } TEST(lapack, cgetrs_batch)