diff --git a/Makefile b/Makefile index 3fcd41ae..87f39261 100644 --- a/Makefile +++ b/Makefile @@ -138,7 +138,7 @@ OBJS := $(OBJS) $(SRCDIR)/PETSc_Helper.o \ $(SRCDIR)/PCPFLAREINV.o # Define a variable containing all the tests -export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset +export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset ex12f_gmres_poly mat_diag # Include kokkos examples ifeq ($(PETSC_HAVE_KOKKOS),1) export TEST_TARGETS := $(TEST_TARGETS) adv_1dk diff --git a/README.md b/README.md index 13a5cff8..2719d0df 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ PCPFLAREINV contains methods for computing approximate inverses, most of which c | ------------- | -- | ------------- | -- | | power | PFLAREINV_POWER | GMRES polynomial, applied as a mononomial, with coefficients computed with a power basis | Yes | | arnoldi | PFLAREINV_ARNOLDI | GMRES polynomial, applied as a mononomial, with coefficients computed with an Arnoldi method | Yes | - | newton | PFLAREINV_NEWTON | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with extra roots added for stability | Yes | - | newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Yes | + | newton | PFLAREINV_NEWTON | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with extra roots added for stability | Matrix-free: Yes Assembled: No | + | newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Matrix-free: Yes Assembled: No | | neumann | PFLAREINV_NEUMANN | Neumann polynomial | Yes | | sai | PFLAREINV_SAI | Sparse approximate inverse | No | | isai | PFLAREINV_ISAI | Incomplete sparse approximate inverse (equivalent to a one-level RAS) | No | diff --git a/dockerfiles/Dockerfile_intel b/dockerfiles/Dockerfile_intel index ea51ed6b..b7a2ae3c 100644 --- a/dockerfiles/Dockerfile_intel +++ b/dockerfiles/Dockerfile_intel @@ -28,6 +28,9 @@ RUN source /opt/intel/oneapi/setvars.sh && \ sed -i '/^run_tests_load_parallel:/,/^run_tests_no_load_serial:/s/-pc_pflareinv_type power/-pc_pflareinv_type arnoldi/g' tests/Makefile && \ sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/-pc_air_inverse_type power/-pc_air_inverse_type arnoldi/g' tests/Makefile && \ sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/-pc_pflareinv_type power/-pc_pflareinv_type arnoldi/g' tests/Makefile && \ + echo "Disabling power basis in parallel test ex12f_gmres_poly in tests/Makefile" && \ + sed -i '/^run_tests_load_parallel:/,/^run_tests_no_load_serial:/s/ex12f_gmres_poly/ex12f_gmres_poly -no_power/g' tests/Makefile && \ + sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/ex12f_gmres_poly/ex12f_gmres_poly -no_power/g' tests/Makefile && \ make -j2 && make -j2 check && \ make -j2 tests diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 180d0f0d..5893e453 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -424,7 +424,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) end if ! Convert Aff to a matdiagonal type - ! Haven't rewritten sai to take advantage of matdiagonal + ! 
Haven't rewritten some inverse types to take advantage of matdiagonal if (aff_diag .AND. & inverse_type_aff /= PFLAREINV_SAI .AND. & inverse_type_aff /= PFLAREINV_ISAI) then diff --git a/src/Approx_Inverse_Setup.F90 b/src/Approx_Inverse_Setup.F90 index 35559cab..293f4e03 100644 --- a/src/Approx_Inverse_Setup.F90 +++ b/src/Approx_Inverse_Setup.F90 @@ -70,7 +70,12 @@ subroutine calculate_and_build_approximate_inverse(matrix, inverse_type, & allocate(coefficients(poly_order + 1, 1)) end if else - coefficients => coefficients_stack + if (inverse_type == PFLAREINV_NEWTON .OR. inverse_type == PFLAREINV_NEWTON_NO_EXTRA) then + ! Newton basis needs storage for real and imaginary roots + allocate(coefficients(poly_order + 1, 2)) + else + coefficients => coefficients_stack + end if end if ! This is diabolical - In petsc 3.22, they changed the way to test for @@ -303,13 +308,9 @@ subroutine finish_approximate_inverse(matrix, inverse_type, & ! Gmres polynomial with newton basis else if (inverse_type == PFLAREINV_NEWTON .OR. inverse_type == PFLAREINV_NEWTON_NO_EXTRA) then - if (.NOT. matrix_free) then - print *, "GMRES polynomial with Newton basis must be applied matrix-free" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - call build_gmres_polynomial_newton_inverse(matrix, poly_order, & coefficients, & + inverse_sparsity_order, matrix_free, reuse_mat, reuse_submatrices, & inv_matrix) ! Neumann polynomial diff --git a/src/Gmres_Poly.F90 b/src/Gmres_Poly.F90 index 552f77ac..309b42e3 100644 --- a/src/Gmres_Poly.F90 +++ b/src/Gmres_Poly.F90 @@ -851,7 +851,6 @@ subroutine mat_mult_powers_share_sparsity_cpu(matrix, poly_order, poly_sparsity_ ! Compute matrix powers c = coeff(1) * I + coeff(2) * A + coeff(3) * A^2 + coeff(4) * A^3 + ... ! where a c and the powers all share the same sparsity as the power input in poly_sparsity_order - ! Assuming cmat has not been built/allocated ! This also finishes the async comms required to compute the gmres poly coefficients if buffers%request is allocated ! ~~~~~~~~~~ @@ -1655,7 +1654,7 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, integer :: order PetscErrorCode :: ierr logical :: reuse_triggered - type(tVec) :: rhs_copy, diag_vec, power_vec + type(tVec) :: inv_vec, diag_vec, power_vec ! ~~~~~~ reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) @@ -1666,9 +1665,9 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, ! This stores D^order if (.NOT. reuse_triggered) then - call VecDuplicate(diag_vec, rhs_copy, ierr) + call VecDuplicate(diag_vec, inv_vec, ierr) else - call MatDiagonalGetDiagonal(inv_matrix, rhs_copy, ierr) + call MatDiagonalGetDiagonal(inv_matrix, inv_vec, ierr) end if call VecCopy(diag_vec, power_vec, ierr) @@ -1678,22 +1677,22 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, call finish_gmres_polynomial_coefficients_power(poly_order, buffers, coefficients) ! Set: alpha_0 * I - call VecSet(rhs_copy, coefficients(1), ierr) + call VecSet(inv_vec, coefficients(1), ierr) ! Calculate: alpha_0 * I + alpha_1 * D + alpha_2 * D^2 do order = 1, poly_order - call VecAXPY(rhs_copy, coefficients(order+1), power_vec, ierr) + call VecAXPY(inv_vec, coefficients(order+1), power_vec, ierr) ! Compute power_vec = power_vec * D if (order /= poly_order) call VecPointwiseMult(power_vec, power_vec, diag_vec, ierr) end do ! We may be reusing with the same sparsity if (.NOT. reuse_triggered) then - ! 
The matrix takes ownership of rhs_copy and increases ref counter - call MatCreateDiagonal(rhs_copy, inv_matrix, ierr) - call VecDestroy(rhs_copy, ierr) + ! The matrix takes ownership of inv_vec and increases ref counter + call MatCreateDiagonal(inv_vec, inv_matrix, ierr) + call VecDestroy(inv_vec, ierr) else - call MatDiagonalRestoreDiagonal(inv_matrix, rhs_copy, ierr) + call MatDiagonalRestoreDiagonal(inv_matrix, inv_vec, ierr) end if call VecDestroy(diag_vec, ierr) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 87e5f8da..1761c901 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -135,6 +135,240 @@ subroutine modified_leja(real_roots, imag_roots, indices) end subroutine modified_leja + ! ------------------------------------------------------------------------------------------------- + + subroutine cluster_eigenvalues_stable(real_roots, imag_roots, rel_tol, abs_tol) + + ! Clustering of (possibly complex) harmonic Ritz values. + ! Numerically distinct clusters are moved to the front. + ! Remaining entries are set to zero. + ! Skips eigenvalues that are exactly zero (both real and imag parts). + ! + ! Inputs: + ! real_roots, imag_roots : eigenvalues (length k) + ! rel_tol : relative tolerance (suggest sqrt(eps) ~ 1e-8) + ! abs_tol : absolute tolerance (suggest eps * ||H|| ~ 1e-15) + ! + ! Outputs: + ! real_roots, imag_roots : clustered eigenvalues at front, zeros after + + PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots + PetscReal, intent(in) :: rel_tol, abs_tol + integer :: i, j, n, n_unique, cluster_size + logical, allocatable :: used(:) + PetscReal :: dist, mag_i, mag_j, scale + PetscReal :: sum_real, sum_imag + PetscReal, allocatable :: rtmp(:), itmp(:) + + n = size(real_roots) + + allocate(used(n)) + allocate(rtmp(n), itmp(n)) + + used = .false. + n_unique = 0 + + ! --------------------------------------------------------- + ! All-pairs clustering + ! --------------------------------------------------------- + do i = 1, n + + if (used(i)) cycle + + ! Skip eigenvalues that are exactly zero + if (real_roots(i) == 0.0d0 .AND. imag_roots(i) == 0.0d0) then + used(i) = .true. + cycle + end if + + ! Start new cluster with eigenvalue i + sum_real = real_roots(i) + sum_imag = imag_roots(i) + cluster_size = 1 + used(i) = .true. + + mag_i = sqrt(real_roots(i)**2 + imag_roots(i)**2) + + ! Look for all other eigenvalues close to this one + do j = i + 1, n + + if (used(j)) cycle + + ! Skip exactly zero eigenvalues + if (real_roots(j) == 0.0d0 .AND. imag_roots(j) == 0.0d0) then + used(j) = .true. + cycle + end if + + mag_j = sqrt(real_roots(j)**2 + imag_roots(j)**2) + + ! Distance between eigenvalues + dist = sqrt((real_roots(j) - real_roots(i))**2 + & + (imag_roots(j) - imag_roots(i))**2) + + ! Use the larger magnitude for relative scaling + scale = max(mag_i, mag_j, 1.0d0) + + ! Check if within tolerance + if (dist <= abs_tol + rel_tol * scale) then + sum_real = sum_real + real_roots(j) + sum_imag = sum_imag + imag_roots(j) + cluster_size = cluster_size + 1 + used(j) = .true. + end if + + end do + + ! Compute cluster centroid (mean) + n_unique = n_unique + 1 + rtmp(n_unique) = sum_real / dble(cluster_size) + itmp(n_unique) = sum_imag / dble(cluster_size) + + end do + + ! --------------------------------------------------------- + ! Output compact form + ! 
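+      ! As an illustrative example (hypothetical values, not from a run):
+      ! with rel_tol = 1e-8 the inputs (2.0, 2.0 + 1e-9, 0.0, 5.0) compact
+      ! below to (2.0 + 5e-10, 5.0, 0.0, 0.0) - the two nearby values merge
+      ! into their centroid, exact zeros are skipped, and zeros pad the tail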
+      ! ---------------------------------------------------------
+      real_roots = 0.0d0
+      imag_roots = 0.0d0
+
+      real_roots(1:n_unique) = rtmp(1:n_unique)
+      imag_roots(1:n_unique) = itmp(1:n_unique)
+
+      deallocate(used, rtmp, itmp)
+
+   end subroutine cluster_eigenvalues_stable
+
+
+   ! -------------------------------------------------------------------------------------------------
+
+   subroutine compute_extra_roots(real_roots, imag_roots, real_roots_output, imag_roots_output)
+
+      ! Add extra roots for stability
+      ! Computes the product of factors for each eigenvalue and adds extra copies
+      ! of roots that have large products (to improve polynomial stability)
+      ! Only non-zero eigenvalues should be passed in
+      ! real_roots_output, imag_roots_output are allocated and filled with the original
+      ! roots plus any extra
+
+      ! ~~~~~~
+      PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots
+      PetscReal, dimension(:), allocatable, intent(inout) :: real_roots_output, imag_roots_output
+
+      ! Local variables
+      integer :: i_loc, j_loc, k_loc, n_roots, total_extra, counter
+      PetscReal :: a, b, c, d, div_real, div_imag, div_mag
+      PetscReal, dimension(size(real_roots)) :: pof
+      integer, dimension(size(real_roots)) :: extra_pair_roots, overflow
+
+      ! ~~~~~~
+
+      n_roots = size(real_roots)
+
+      ! Compute the product of factors
+      pof = 1
+      extra_pair_roots = 0
+      overflow = 0
+      total_extra = 0
+      do k_loc = 1, n_roots
+
+         a = real_roots(k_loc)
+         b = imag_roots(k_loc)
+
+         ! We have already computed pof for the positive imaginary complex conjugate
+         if (b < 0) cycle
+
+         ! Skips eigenvalues that are numerically zero
+         if (abs(a) < 1e-12) cycle
+         if (a**2 + b**2 < 1e-12) cycle
+
+         ! Compute pof(k) = prod_{i /= k} |1 - theta_k/theta_i|
+         do i_loc = 1, n_roots
+
+            ! Skip
+            if (k_loc == i_loc) cycle
+
+            c = real_roots(i_loc)
+            d = imag_roots(i_loc)
+
+            ! Skips eigenvalues that are numerically zero
+            if (abs(c) < 1e-12) cycle
+            if (c**2 + d**2 < 1e-12) cycle
+
+            ! theta_k/theta_i
+            div_real = (a * c + b * d)/(c**2 + d**2)
+            div_imag = (b * c - a * d)/(c**2 + d**2)
+
+            ! |1 - theta_k/theta_i|
+            div_mag = sqrt((1 - div_real)**2 + div_imag**2)
+
+            ! pof is about to overflow, so store the exponent and
+            ! reset pof back to one
+            ! We can hit this for very high order polynomials, where we have to
+            ! add more roots than 22 (ie pof > 1e308)
+            if (log10(pof(k_loc)) + log10(div_mag) > 307) then
+               overflow(k_loc) = overflow(k_loc) + int(log10(pof(k_loc)))
+               pof(k_loc) = 1
+            end if
+
+            ! Product
+            pof(k_loc) = pof(k_loc) * div_mag
+
+         end do
+
+         ! If pof > 1e4, we add an extra root, plus one extra for every 1e14
+         if (log10(pof(k_loc)) > 4 .OR. overflow(k_loc) /= 0) then
+
+            ! If real, extra_pair_roots counts each distinct real root we're adding
+            ! If imaginary, it only counts a pair as one
+            extra_pair_roots(k_loc) = ceiling((log10(pof(k_loc)) + overflow(k_loc) - 4.0)/14.0)
+            total_extra = total_extra + extra_pair_roots(k_loc)
+
+            ! If imaginary, the pof is the same for the conjugate, so let's just set it to -1
+            if (b > 0) then
+               ! We know the positive imaginary value is first, so the conjugate follows it
+               pof(k_loc+1) = -1
+               ! We need the conjugates as well
+               total_extra = total_extra + extra_pair_roots(k_loc)
+
+            end if
+         end if
+      end do
+
+      ! Allocate output arrays (original roots + extra roots)
+      allocate(real_roots_output(n_roots + total_extra))
+      allocate(imag_roots_output(n_roots + total_extra))
+      real_roots_output = 0d0
+      imag_roots_output = 0d0
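+      ! As an illustrative example of the sizing above (hypothetical values):
+      ! a real root with pof = 1e20 gives extra_pair_roots = ceiling((20 - 4)/14) = 2,
+      ! while a complex conjugate pair with the same pof contributes 2 extra
+      ! copies of the root plus 2 of its conjugate, so total_extra grows by 4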
+      ! Copy in original roots
+      real_roots_output(1:n_roots) = real_roots(1:n_roots)
+      imag_roots_output(1:n_roots) = imag_roots(1:n_roots)
+
+      ! Add the extra copies of roots, ensuring conjugate pairs we add
+      ! are next to each other
+      counter = n_roots + 1
+      do i_loc = 1, n_roots
+
+         ! For each extra root pair to add
+         do j_loc = 1, extra_pair_roots(i_loc)
+
+            real_roots_output(counter) = real_roots(i_loc)
+            imag_roots_output(counter) = imag_roots(i_loc)
+            ! Add in the conjugate
+            if (imag_roots(i_loc) > 0) then
+               real_roots_output(counter+1) = real_roots(i_loc)
+               imag_roots_output(counter+1) = -imag_roots(i_loc)
+            end if
+
+            counter = counter + 1
+            if (imag_roots(i_loc) > 0) counter = counter + 1
+         end do
+      end do
+
+   end subroutine compute_extra_roots
+
+
 ! -------------------------------------------------------------------------------------------------------------------------------
 
    subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots, coefficients)
@@ -160,22 +394,22 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots
 
       ! Local variables
       PetscInt :: global_rows, global_cols, local_rows, local_cols
       integer :: lwork, subspace_size, rank, i_loc, comm_size, comm_rank, errorcode, iwork_size, j_loc
-      integer :: total_extra, counter, k_loc, m
+      integer :: total_extra, counter, k_loc, m, numerical_order
       PetscErrorCode :: ierr
      MPIU_Comm :: MPI_COMM_MATRIX
       PetscReal, dimension(poly_order+2,poly_order+1) :: H_n
       PetscReal, dimension(poly_order+1,poly_order+2) :: H_n_T
-      PetscReal, dimension(poly_order+1) :: e_d, solution, s, pof
-      integer, dimension(poly_order+1) :: extra_pair_roots, overflow
+      PetscReal, dimension(poly_order+1) :: e_d, solution, s
       integer, dimension(:), allocatable :: iwork_allocated, indices
-      PetscReal, dimension(:), allocatable :: work
+      PetscReal, dimension(:), allocatable :: work, real_roots_added, imag_roots_added
+      PetscReal, dimension(:), allocatable :: perturbed_real, perturbed_imag
       PetscReal, dimension(:,:), allocatable :: VL, VR
-      PetscReal :: beta, div_real, div_imag, a, b, c, d, div_mag
+      PetscReal :: beta
       PetscReal, dimension(:, :), allocatable :: coefficients_temp
       type(tVec) :: w_j
       type(tVec), dimension(poly_order+2) :: V_n
       logical :: use_harmonic_ritz = .TRUE.
-      PetscReal :: rcond = 1e-12
+      PetscReal :: rcond = 1e-12, rel_tol, abs_tol, H_norm
 
       ! ~~~~~~
 
@@ -236,8 +470,9 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots
       ! What we find is that when use this to compute eigenvalues we find e-vals
       ! as we might expect up to the rank
       ! but then we have some eigenvalues that are numerically zero
-      ! We keep those and our application of the newton polynomial in
-      ! petsc_matvec_gmres_newton_mf and petsc_matvec_gmres_newton_mf_residual
+      ! Given the way the outside code is structured, we can't lower the poly_order
+      ! in this routine and return
+      ! Instead we keep the "zero" eigenvalues, and our application of the newton polynomial
       ! just skips them and hence we don't do any
       ! extra work in the application phase than we would have done with lower order
 
@@ -264,8 +499,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots
       call dgelsd(poly_order + 1, poly_order + 1, 1, H_n_T, size(H_n_T, 1), &
                   e_d, size(e_d), s, rcond, rank, &
                   work, lwork, iwork_allocated, errorcode)
-      deallocate(work, iwork_allocated)
-
+      deallocate(work, iwork_allocated)
+
+      ! 
Copy in the solution solution = e_d @@ -310,144 +545,146 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots end if ! ~~~~~~~~~~~~~~ - ! Add roots for stability - ! ~~~~~~~~~~~~~~ - if (add_roots) then - - ! Compute the product of factors - pof = 1 - extra_pair_roots = 0 - overflow = 0 - total_extra = 0 - do k_loc = 1, poly_order + 1 - - a = coefficients(k_loc, 1) - b = coefficients(k_loc, 2) - - ! We have already computed pof for the positive imaginary complex conjugate - if (b < 0) cycle - - ! Skips eigenvalues that are numerically zero - if (abs(a) < 1e-12) cycle - if (a**2 + b**2 < 1e-12) cycle - - ! Compute product(k)_{i, j/=i} * | 1 - theta_j/theta_i| - do i_loc = 1, poly_order + 1 - - ! Skip - if (k_loc == i_loc) cycle - - c = coefficients(i_loc, 1) - d = coefficients(i_loc, 2) + ! Now we have to check the output eigenvalues + ! ~~~~~~~~~~~~~~ - ! Skips eigenvalues that are numerically zero - if (abs(c) < 1e-12) cycle - if (c**2 + d**2 < 1e-12) cycle + ! These are the tolerances that control the clustering + H_norm = norm2(H_n(1:m,1:m)) + rel_tol = 1.0d0 * sqrt(epsilon(1.0d0)) + abs_tol = epsilon(1.0d0) * max(H_norm, beta) + + ! In some cases with numerical rank deficiency, we can still + ! end up with non-zero (or negative) eigenvalues that + ! are trivially small - we set them explicitly to zero + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc,1)**2 + coefficients(i_loc,2)**2 < & + (abs_tol + rel_tol*H_norm)**2) then + coefficients(i_loc, 1) = 0d0 + coefficients(i_loc, 2) = 0d0 + end if + end do + + ! ~~~~~~~~~~~~~~ + ! Cluster close eigenvalues together to improve stability of the polynomial evaluation + ! For example when computing the e'vals of a constant diagonal matrix + ! the rank revealing factorisation above doesn't always report a rank of 1 given roundoff + ! Instead it returns multiple eigenvalues that are very close to each other, + ! and we want to cluster those together and treat them as one root + ! ~~~~~~~~~~~~~~ - ! theta_k/theta_i - div_real = (a * c + b * d)/(c**2 + d**2) - div_imag = (b * c - a * d)/(c**2 + d**2) + ! This places all exactly zero eigenvalues at the end of coefficients + call cluster_eigenvalues_stable(coefficients(:, 1), coefficients(:, 2), rel_tol, abs_tol) - ! |1 - theta_k/theta_i| - div_mag = sqrt((1 - div_real)**2 + div_imag**2) + ! ~~~~~~~~~~~~~~ + ! Extract the non-zero eigenvalues for root adding and leja ordering + ! Zero eigenvalues will be appended at the end + ! ~~~~~~~~~~~~~~ + ! Count the number of non-zero eigenvalues after clustering + numerical_order = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then + numerical_order = numerical_order + 1 + end if + end do - ! Pof is about to overflow, store the exponent and - ! reset pof back to one - ! We can hit this for very high order polynomials, where we have to - ! add more roots than 22 (ie pof > 1e308) - if (log10(pof(k_loc)) + log10(div_mag) > 307) then - overflow(k_loc) = overflow(k_loc) + int(log10(pof(k_loc))) - pof(k_loc) = 1 - end if + ! ~~~~~~~~~~~~~~ + ! Add roots for stability (only on non-zero eigenvalues) + ! ~~~~~~~~~~~~~~ + if (add_roots .AND. numerical_order > 0) then - ! Product - pof(k_loc) = pof(k_loc) * div_mag + ! Extract non-zero eigenvalues into a temporary array + allocate(coefficients_temp(numerical_order, 2)) + counter = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. 
coefficients(i_loc, 2) /= 0d0) then + counter = counter + 1 + coefficients_temp(counter, 1) = coefficients(i_loc, 1) + coefficients_temp(counter, 2) = coefficients(i_loc, 2) + end if + end do + ! Call compute_extra_roots only on the non-zero eigenvalues + ! This allocates real_roots_added/imag_roots_added with the original + extra roots + call compute_extra_roots(coefficients_temp(:, 1), coefficients_temp(:, 2), & + real_roots_added, imag_roots_added) + + ! total number of non-zero roots after adding extras + total_extra = size(real_roots_added) - numerical_order + + ! Resize coefficients to hold non-zero roots (with extras) + zero roots at end + deallocate(coefficients) + allocate(coefficients(size(real_roots_added) + (poly_order + 1 - numerical_order), 2)) + coefficients = 0d0 + + ! Create perturbed copy for leja ordering + allocate(perturbed_real(size(real_roots_added))) + allocate(perturbed_imag(size(real_roots_added))) + perturbed_real = real_roots_added + perturbed_imag = imag_roots_added + + ! Perturb the extra roots so they have unique values for the leja sort + counter = numerical_order + 1 + do i_loc = 1, numerical_order + k_loc = 0 + do j_loc = counter, size(real_roots_added) + ! Check if this extra root matches the original + if (real_roots_added(j_loc) == coefficients_temp(i_loc, 1) .AND. & + abs(imag_roots_added(j_loc)) == abs(coefficients_temp(i_loc, 2))) then + k_loc = k_loc + 1 + perturbed_real(j_loc) = real_roots_added(j_loc) + k_loc * 5e-8 + end if end do + end do - ! If pof > 1e4, we add an extra root, plus one extra for every 1e14 - if (log10(pof(k_loc)) > 4 .OR. overflow(k_loc) /= 0) then + ! Leja order only the non-zero eigenvalues (with extras) + call modified_leja(perturbed_real, perturbed_imag, indices) - ! if real extra_pair_roots counts each distinct real root we're adding - ! if imaginary it only counts a pair as one - extra_pair_roots(k_loc) = ceiling((log10(pof(k_loc)) + overflow(k_loc) - 4.0)/14.0) - total_extra = total_extra + extra_pair_roots(k_loc) + ! Reorder the (non-perturbed) roots using the leja ordering + coefficients(1:size(real_roots_added), 1) = real_roots_added(indices) + coefficients(1:size(real_roots_added), 2) = imag_roots_added(indices) - ! If imaginary, the pof is the same for the conjugate, let's just set it to -1 - if (b > 0) then - ! We know the positive imaginary value is first, so the conjugate follows it - pof(k_loc+1) = -1 - ! We need the conjugates as well - total_extra = total_extra + extra_pair_roots(k_loc) + ! Zero eigenvalues are already zero at the end from the coefficients = 0d0 above - end if - end if - end do + ! Cleanup + deallocate(coefficients_temp, real_roots_added, imag_roots_added) + deallocate(perturbed_real, perturbed_imag, indices) - ! If we have extra roots we need to resize the coefficients storage - if (total_extra > 0) then - allocate(coefficients_temp(size(coefficients, 1), size(coefficients, 2))) - coefficients_temp(1:size(coefficients, 1), 1:size(coefficients, 2)) = coefficients - deallocate(coefficients) - allocate(coefficients(size(coefficients_temp, 1) + total_extra, 2)) - coefficients = 0 - coefficients(1:size(coefficients_temp, 1), :) = coefficients_temp - deallocate(coefficients_temp) - end if - end if + else - ! Take a copy of the existing roots - coefficients_temp = coefficients + ! No root adding - just leja order the non-zero eigenvalues + ! and put zeros at the end + if (numerical_order > 0) then - if (add_roots) then - - ! Add the extra copies of roots, ensuring conjugate pairs we add - ! 
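+            ! As an illustrative example (hypothetical values): if clustering
+            ! left 6 non-zero roots and 2 zeros, numerical_order = 6, only
+            ! those 6 are passed to modified_leja below and the 2 zeros stay
+            ! in positions 7-8 of coefficients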
-               ! are next to each other
-         counter = size(extra_pair_roots)+1
-         do i_loc = 1, size(extra_pair_roots)
-
-            ! For each extra root pair to add
-            do j_loc = 1, extra_pair_roots(i_loc)
-
-               coefficients(counter, :) = coefficients(i_loc, :)
-               ! Add in the conjugate
-               if (coefficients(i_loc, 2) > 0) then
-                  coefficients(counter+1, 1) = coefficients(i_loc, 1)
-                  coefficients(counter+1, 2) = -coefficients(i_loc, 2)
+            ! Extract non-zero eigenvalues
+            allocate(coefficients_temp(numerical_order, 2))
+            counter = 0
+            do i_loc = 1, poly_order + 1
+               if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then
+                  counter = counter + 1
+                  coefficients_temp(counter, 1) = coefficients(i_loc, 1)
+                  coefficients_temp(counter, 2) = coefficients(i_loc, 2)
                end if
+            end do
 
-               ! Store a perturbed root so we have unique values for the leja sort below
-               ! Just peturbing the real value
-               coefficients_temp(counter, 1) = coefficients(i_loc, 1) + j_loc * 5e-8
-               coefficients_temp(counter, 2) = coefficients(i_loc, 2)
-               ! Add in the conjugate
-               if (coefficients(i_loc, 2) > 0) then
-                  coefficients_temp(counter+1, 1) = coefficients(i_loc, 1) + j_loc * 5e-8
-                  coefficients_temp(counter+1, 2) = -coefficients(i_loc, 2)
-               end if
+            ! Leja order the non-zero eigenvalues
+            call modified_leja(coefficients_temp(:, 1), coefficients_temp(:, 2), indices)
 
-               counter = counter + 1
-               if (coefficients(i_loc, 2) > 0) counter = counter + 1
-            end do
-         end do
-      end if
+            ! Reorder and put zeros at the end
+            coefficients = 0d0
+            coefficients(1:numerical_order, 1) = coefficients_temp(indices, 1)
+            coefficients(1:numerical_order, 2) = coefficients_temp(indices, 2)
 
-      ! ~~~~~~~~~~~~~~
-      ! Now compute a modified leja ordering for stability
-      ! ~~~~~~~~~~~~~~
-      ! Called with the peturbed extra roots
-      call modified_leja(coefficients_temp(:,1), coefficients_temp(:,2), indices)
+            deallocate(coefficients_temp, indices)
 
-      ! Reorder the (non-peturbed) roots
-      coefficients(:,1) = coefficients(indices,1)
-      coefficients(:,2) = coefficients(indices,2)
+         end if
+
+      end if
 
       ! Cleanup
-      deallocate(coefficients_temp)
       do i_loc = 1, subspace_size+1
          call VecDestroy(V_n(i_loc), ierr)
       end do
-      call VecDestroy(w_j, ierr)
+      call VecDestroy(w_j, ierr)
 
    end subroutine calculate_gmres_polynomial_roots_newton
 
@@ -469,7 +706,7 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y)
       type(tVec) :: y
 
       ! Local
-      integer :: order, errorcode
+      integer :: i, errorcode
       PetscErrorCode :: ierr
       type(mat_ctxtype), pointer :: mat_ctx => null()
 
@@ -487,24 +724,24 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y)
       call VecSet(y, 0d0, ierr)
 
       ! ~~~~~~~~~~~~
-      ! Iterate over the order
+      ! Iterate over the roots
       ! ~~~~~~~~~~~~
-      order = 1
-      do while (order .le. size(mat_ctx%real_roots) - 1)
+      i = 1
+      do while (i .le. size(mat_ctx%real_roots) - 1)
 
          ! If real this is easy
-         if (mat_ctx%imag_roots(order) == 0d0) then
+         if (mat_ctx%imag_roots(i) == 0d0) then
 
            ! Skips eigenvalues that are numerically zero - see
            ! the comment in calculate_gmres_polynomial_roots_newton
-            if (abs(mat_ctx%real_roots(order)) < 1e-12) then
-               order = order + 1
+            if (abs(mat_ctx%real_roots(i)) < 1e-12) then
+               i = i + 1
               cycle
            end if
 
            ! y = y + theta_i * MF_VEC_TEMP
            call VecAXPY(y, &
-                  1d0/mat_ctx%real_roots(order), &
+                  1d0/mat_ctx%real_roots(i), &
                  mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr)
 
            ! MF_VEC_DIAG = A * MF_VEC_TEMP
            call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_TEMP), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr)
            ! 
MF_VEC_TEMP = MF_VEC_TEMP - theta_i * MF_VEC_DIAG call VecAXPY(mat_ctx%mf_temp_vec(MF_VEC_TEMP), & - -1d0/mat_ctx%real_roots(order), & + -1d0/mat_ctx%real_roots(i), & mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) - order = order + 1 + i = i + 1 ! If imaginary, then have to combine the e'val and its ! complex conjugate to keep the arithmetic real @@ -523,8 +760,8 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) else ! Skips eigenvalues that are numerically zero - if (mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2 < 1e-12) then - order = order + 2 + if (mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2 < 1e-12) then + i = i + 2 cycle end if @@ -532,27 +769,27 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_TEMP), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) ! MF_VEC_DIAG = 2 * Re(theta_i) * MF_VEC_TEMP - MF_VEC_DIAG call VecAXPBY(mat_ctx%mf_temp_vec(MF_VEC_DIAG), & - 2 * mat_ctx%real_roots(order), & + 2 * mat_ctx%real_roots(i), & -1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) ! y = y + 1/(Re(theta_i)^2 + Imag(theta_i)^2) * MF_VEC_DIAG call VecAXPY(y, & - 1d0/(mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2), & + 1d0/(mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2), & mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) - if (order .le. size(mat_ctx%real_roots) - 2) then + if (i .le. size(mat_ctx%real_roots) - 2) then ! MF_VEC_RHS = A * MF_VEC_DIAG call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_DIAG), mat_ctx%mf_temp_vec(MF_VEC_RHS), ierr) ! MF_VEC_TEMP = MF_VEC_TEMP - 1/(Re(theta_i)^2 + Imag(theta_i)^2) * MF_VEC_RHS call VecAXPY(mat_ctx%mf_temp_vec(MF_VEC_TEMP), & - -1d0/(mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2), & + -1d0/(mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2), & mat_ctx%mf_temp_vec(MF_VEC_RHS), ierr) end if ! Skip two evals - order = order + 2 + i = i + 2 end if end do @@ -561,11 +798,11 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) if (mat_ctx%imag_roots(size(mat_ctx%real_roots)) == 0d0) then ! Skips eigenvalues that are numerically zero - if (abs(mat_ctx%real_roots(order)) > 1e-12) then + if (abs(mat_ctx%real_roots(size(mat_ctx%real_roots))) > 1e-12) then ! y = y + theta_i * MF_VEC_TEMP call VecAXPBY(y, & - 1d0/mat_ctx%real_roots(order), & + 1d0/mat_ctx%real_roots(size(mat_ctx%real_roots)), & 1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) end if @@ -669,11 +906,737 @@ subroutine petsc_matvec_gmres_newton_mf_residual(mat, x, y) end do end subroutine petsc_matvec_gmres_newton_mf_residual +!------------------------------------------------------------------------------------------------------------------------ + + subroutine mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsity_order, coefficients, & + reuse_mat, reuse_submatrices, cmat) + + ! Wrapper around mat_mult_powers_share_sparsity_cpu and mat_mult_powers_share_sparsity_kokkos + + ! ~~~~~~~~~~ + ! Input + type(tMat), target, intent(in) :: matrix + integer, intent(in) :: poly_order, poly_sparsity_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: reuse_mat, cmat + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices + +! #if defined(PETSC_HAVE_KOKKOS) +! integer(c_long_long) :: A_array, B_array, reuse_array +! integer :: errorcode, reuse_int_cmat, reuse_int_reuse_mat +! PetscErrorCode :: ierr +! MatType :: mat_type +! Mat :: temp_mat, temp_mat_reuse, temp_mat_compare +! PetscScalar normy; +! 
logical :: reuse_triggered_cmat, reuse_triggered_reuse_mat +! type(c_ptr) :: coefficients_ptr +! type(tMat) :: reuse_mat_cpu +! type(tMat), dimension(:), pointer :: reuse_submatrices_cpu +! #endif + ! ~~~~~~~~~~ + + ! ~~~~~~~~~~ + ! Special case if we just want to return a gmres polynomial with the sparsity of the diagonal + ! This is like a damped Jacobi + ! ~~~~~~~~~~ +if (poly_sparsity_order == 0) then + + call build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly_order, & + coefficients, cmat) + + return +end if + +! #if defined(PETSC_HAVE_KOKKOS) + +! call MatGetType(matrix, mat_type, ierr) +! if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & +! mat_type == MATAIJKOKKOS) then + +! A_array = matrix%v +! reuse_triggered_cmat = .NOT. PetscObjectIsNull(cmat) +! reuse_triggered_reuse_mat = .NOT. PetscObjectIsNull(reuse_mat) +! reuse_int_cmat = 0 +! if (reuse_triggered_cmat) then +! reuse_int_cmat = 1 +! B_array = cmat%v +! end if +! reuse_int_reuse_mat = 0 +! if (reuse_triggered_reuse_mat) then +! reuse_int_reuse_mat = 1 +! end if +! reuse_array = reuse_mat%v +! coefficients_ptr = c_loc(coefficients) + +! ! call mat_mult_powers_share_sparsity_newton_kokkos(A_array, poly_order, poly_sparsity_order, & +! ! coefficients_ptr, reuse_int_reuse_mat, reuse_array, reuse_int_cmat, B_array) + +! reuse_mat%v = reuse_array +! cmat%v = B_array + +! ! If debugging do a comparison between CPU and Kokkos results +! if (kokkos_debug()) then + +! ! If we're doing reuse and debug, then we have to always output the result +! ! from the cpu version, as it will have coo preallocation structures set +! ! They aren't copied over if you do a matcopy (or matconvert) +! ! If we didn't do that the next time we come through this routine +! ! and try to call the cpu version with reuse, it will segfault +! if (reuse_triggered_cmat) then +! temp_mat = cmat +! call MatConvert(cmat, MATSAME, MAT_INITIAL_MATRIX, temp_mat_compare, ierr) +! else +! temp_mat_compare = cmat +! end if + +! ! Debug check if the CPU and Kokkos versions are the same +! ! We send in an empty reuse_mat_cpu here always, as we can't pass through +! ! the same one Kokkos uses as it now only gets out the non-local rows we need +! ! (ie reuse_mat and reuse_mat_cpu are no longer the same size) +! reuse_submatrices_cpu => null() +! call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & +! coefficients, reuse_mat_cpu, reuse_submatrices_cpu, temp_mat) +! call destroy_matrix_reuse(reuse_mat_cpu, reuse_submatrices_cpu) + +! call MatConvert(temp_mat, MATSAME, MAT_INITIAL_MATRIX, & +! temp_mat_reuse, ierr) + +! call MatAXPYWrapper(temp_mat_reuse, -1d0, temp_mat_compare) +! call MatNorm(temp_mat_reuse, NORM_FROBENIUS, normy, ierr) +! ! There is floating point compute in these inverses, so we have to be a +! ! bit more tolerant to rounding differences +! if (normy .gt. 1d-11 .OR. normy/=normy) then +! !call MatFilter(temp_mat_reuse, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) +! !call MatView(temp_mat_reuse, PETSC_VIEWER_STDOUT_WORLD, ierr) +! print *, "Kokkos and CPU versions of mat_mult_powers_share_sparsity do not match" + +! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) +! end if +! call MatDestroy(temp_mat_reuse, ierr) +! if (.NOT. reuse_triggered_cmat) then +! call MatDestroy(cmat, ierr) +! else +! call MatDestroy(temp_mat_compare, ierr) +! end if +! cmat = temp_mat +! end if + +! else + +! call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & +! 
+!                          coefficients, reuse_mat, reuse_submatrices, cmat)
+
+! end if
+! #else
+   call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, &
+            coefficients, reuse_mat, reuse_submatrices, cmat)
+! #endif
+
+   ! ~~~~~~~~~~
+
+   end subroutine mat_mult_powers_share_sparsity_newton
+
+!------------------------------------------------------------------------------------------------------------------------
+
+   subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, coefficients, &
+                  reuse_mat, reuse_submatrices, cmat)
+
+      ! Compute newton powers with the same sparsity
+
+      ! ~~~~~~~~~~
+      ! Input
+      type(tMat), target, intent(in) :: matrix
+      integer, intent(in) :: poly_order, poly_sparsity_order
+      PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients
+      type(tMat), intent(inout) :: reuse_mat, cmat
+      type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices
+
+      PetscInt :: local_rows, local_cols, global_rows, global_cols
+      PetscInt :: global_row_start, global_row_end_plus_one, row_index_into_submatrix
+      PetscInt :: global_col_start, global_col_end_plus_one, n, ncols, ncols_two, ifree, max_nnzs
+      PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0, diag_index
+      integer :: errorcode, match_counter, term
+      integer :: comm_size
+      PetscErrorCode :: ierr
+      integer, dimension(:), allocatable :: cols_index_one, cols_index_two
+      PetscInt, dimension(:), allocatable :: col_indices_off_proc_array, ad_indices, cols
+      PetscReal, dimension(:), allocatable :: vals
+      type(tIS), dimension(1) :: col_indices, row_indices
+      type(tMat) :: Ad, Ao, mat_sparsity_match, mat_product_save
+      PetscInt, dimension(:), pointer :: colmap
+      logical :: deallocate_submatrices = .FALSE.
+      type(c_ptr) :: vals_c_ptr
+      type(int_vec), dimension(:), allocatable :: symbolic_ones
+      type(real_vec), dimension(:), allocatable :: symbolic_vals
+      integer(c_long_long) :: A_array
+      MPIU_Comm :: MPI_COMM_MATRIX
+      PetscReal, dimension(:), allocatable :: vals_power_temp, vals_previous_power_temp, temp
+      PetscInt, dimension(:), pointer :: submatrices_ia, submatrices_ja, cols_two_ptr, cols_ptr
+      PetscReal, dimension(:), pointer :: vals_two_ptr, vals_ptr
+      real(c_double), pointer :: submatrices_vals(:)
+      logical :: reuse_triggered
+      PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done
+      PetscInt, parameter :: one = 1, zero = 0
+      logical :: output_first_complex, skip_add
+      PetscReal :: square_sum
+      integer, dimension(poly_order + 1, 2) :: status_output
+      PetscReal, dimension(poly_sparsity_order+1,2) :: coeffs_contig
+
+      ! ~~~~~~~~~~
+
+      if (poly_sparsity_order .ge. size(coefficients, 1)-1) then
+         print *, "Requested sparsity is greater than or equal to the order"
+         call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
+      end if
+
+      call PetscObjectGetComm(matrix, MPI_COMM_MATRIX, ierr)
+      ! Get the comm size
+      call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode)
+
+      ! Get the local sizes
+      call MatGetLocalSize(matrix, local_rows, local_cols, ierr)
+      call MatGetSize(matrix, global_rows, global_cols, ierr)
+      ! This returns the global index of the local portion of the matrix
+      call MatGetOwnershipRange(matrix, global_row_start, global_row_end_plus_one, ierr)
+      call MatGetOwnershipRangeColumn(matrix, global_col_start, global_col_end_plus_one, ierr)
+
+      reuse_triggered = .NOT. PetscObjectIsNull(cmat)
+
+      ! ~~~~~~~~~~
+      ! Compute cmat for all powers up to poly_sparsity_order
+      ! We have to be more careful here than in the monomial case
+      ! In the monomial case we just compute the matrix powers up to poly_sparsity_order
+      ! and add them, scaled by the coefficients, to cmat
+      ! Here though we have to build the Newton basis polynomials
+      ! As a rule, the value input here into cmat is correct up to the power
+      ! of poly_sparsity_order, with all other terms being added in the fixed sparsity loops
+      ! below
+      !
+      ! For complex conjugate roots, two terms are computed at a time, but if the sparsity order
+      ! falls in between those two roots, only part of the output cmat is included
+      ! Any remaining terms are output in either mat_sparsity_match or mat_product_save depending
+      ! on the case
+      ! mat_sparsity_match has either temp or prod in it depending on the case
+      ! mat_product_save only exists in some cases and stores prod from the previous term
+      ! mat_sparsity_match and mat_product_save are always output with the sparsity of sparsity order
+      !
+      ! status_output is an array of length the number of roots
+      ! and has a 1 in each position if that term has been added to the output
+      ! It just helps us keep track of what has gone into cmat and what hasn't
+      ! It breaks up complex conjugate pairs into the first root (i) which has the same power as prod
+      !     tmp = 2 * a * prod
+      !     p = p + 1/(a^2 + b^2) * tmp
+      ! and the second root (i+1) which has the same power as prod * A:
+      !     tmp = -A * prod
+      !     p = p + 1/(a^2 + b^2) * tmp
+      ! (a worked sketch of these recurrences is annotated below, after the assembly options)
+      ! ~~~~~~~~~~
+
+      output_first_complex = .FALSE.
+      if (poly_sparsity_order == 1) then
+
+         ! If we have a real first coefficient and a second complex
+         ! we can't call build_gmres_polynomial_newton_inverse_1st_1st as it is only correct
+         ! for valid coefficients up to 1st order (ie both real or both complex)
+         if (coefficients(1,2) == 0d0 .AND. coefficients(2,2) /= 0d0) then
+
+            call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, &
+                     cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, &
+                     status_output, mat_product_save)
+
+         ! Valid 1st order polynomial, so this case is easy
+         else
+
+            ! Duplicate & copy the matrix, but ensure there is a diagonal present
+            call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat)
+            ! Have to be careful to pass in a contiguous piece of memory here
+            coeffs_contig = coefficients(1:poly_sparsity_order + 1, 1:2)
+            call build_gmres_polynomial_newton_inverse_1st_1st(matrix, 1, &
+                     coeffs_contig, &
+                     cmat, mat_sparsity_match, &
+                     status_output)
+         end if
+      else
+
+         ! If we're any higher, then we build cmat up to that order
+         ! But we have to be careful because the last root we want to explicitly
+         ! build up to here (ie the power of the matrix given by sparsity_order)
+         ! might be the first root of a complex conjugate pair
+         coeffs_contig = coefficients(1:poly_sparsity_order + 1, 1:2)
+         call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, &
+                  coeffs_contig, &
+                  cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, &
+                  status_output, mat_product_save)
+      end if
+
+      ! We know we will never have non-zero locations outside of the highest constrained sparsity power
+      call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr)
+      call MatSetOption(cmat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr)
+      ! We know we are only going to insert local vals
+      ! These options should turn off any reductions in the assembly
+      call MatSetOption(cmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr)
+
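+      ! As an illustrative sketch of the recurrences named above (ours, in the
+      ! notation of that comment): for a real root theta_1 followed by a
+      ! complex pair (a, b), the fixed sparsity loop below effectively does
+      !    p    = p + 1/theta_1 * prod
+      !    prod = prod - 1/theta_1 * A * prod
+      !    tmp  = 2*a*prod - A*prod
+      !    p    = p + 1/(a^2 + b^2) * tmp
+      !    prod = prod - 1/(a^2 + b^2) * A * tmp
+      ! with every product truncated back onto the sparsity of mat_sparsity_match
+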
+      ! ~~~~~~~~~~~~
+      ! If we're in parallel we need to get the off-process rows of matrix that correspond
+      ! to the columns of mat_sparsity_match
+      ! We can therefore do the matmult for every constrained power locally with just that data
+      ! ~~~~~~~~~~~~
+      ! Have to double check comm_size /= 1 as we might be on a subcommunicator and we can't call
+      ! MatMPIAIJGetSeqAIJ specifically if that's the case
+      if (comm_size /= 1) then
+
+         ! ~~~~
+         ! Get the cols
+         ! ~~~~
+         call MatMPIAIJGetSeqAIJ(mat_sparsity_match, Ad, Ao, colmap, ierr)
+
+         call MatGetSize(Ad, rows_ad, cols_ad, ierr)
+         ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns
+         call MatGetSize(Ao, rows_ao, cols_ao, ierr)
+
+         ! For the column indices we need to take all the columns of mat_sparsity_match
+         A_array = mat_sparsity_match%v
+
+         ! These are the global indices of the columns we want
+         allocate(col_indices_off_proc_array(cols_ad + cols_ao))
+         allocate(ad_indices(cols_ad))
+         ! Local rows (as global indices)
+         do ifree = 1, cols_ad
+            ad_indices(ifree) = global_row_start + ifree - 1
+         end do
+
+         ! col_indices_off_proc_array is now sorted, which are the global indices of the columns we want
+         call merge_pre_sorted(ad_indices, colmap, col_indices_off_proc_array)
+         deallocate(ad_indices)
+
+         ! Create the sequential IS we want with the cols we want (written as global indices)
+         call ISCreateGeneral(PETSC_COMM_SELF, cols_ad + cols_ao, &
+                  col_indices_off_proc_array, PETSC_USE_POINTER, col_indices(1), ierr)
+         call ISCreateGeneral(PETSC_COMM_SELF, cols_ao, &
+                  colmap, PETSC_USE_POINTER, row_indices(1), ierr)
+
+         ! ~~~~~~~
+         ! Now we can pull out the chunk of matrix that we need
+         ! ~~~~~~~
+
+         ! We need off-processor rows to compute matrix powers
+         ! Setting this is necessary to avoid an allreduce when calling createsubmatrices
+         ! This will be reset to false after the call to createsubmatrices
+         call MatSetOption(matrix, MAT_SUBMAT_SINGLEIS, PETSC_TRUE, ierr)
+
+         ! Now this will be doing comms to get the non-local rows we want
+         ! But only including the columns of the local fixed sparsity, as we don't need all the
+         ! columns of the non-local entries unless we are doing a full matmatmult
+         ! This returns a sequential matrix
+         if (.NOT. PetscObjectIsNull(reuse_mat)) then
+            reuse_submatrices(1) = reuse_mat
+            call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_REUSE_MATRIX, reuse_submatrices, ierr)
+         else
+            call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_INITIAL_MATRIX, reuse_submatrices, ierr)
+            reuse_mat = reuse_submatrices(1)
+         end if
+         row_size = size(col_indices_off_proc_array)
+         call ISDestroy(col_indices(1), ierr)
+         call ISDestroy(row_indices(1), ierr)
+
+      ! Easy in serial as we have everything we need
+      else
+
+         Ad = mat_sparsity_match
+         cols_ad = local_cols
+         allocate(reuse_submatrices(1))
+         deallocate_submatrices = .TRUE.
+         reuse_submatrices(1) = matrix
+         row_size = local_rows
+         allocate(col_indices_off_proc_array(local_rows))
+         do ifree = 1, local_rows
+            col_indices_off_proc_array(ifree) = ifree-1
+         end do
+      end if
+
+      ! ~~~~~~~~~
+      ! Now that we are here, reuse_submatrices(1) contains the rows of matrix
+      ! that correspond to the non-zero columns of mat_sparsity_match
+      ! ~~~~~~~~~
+
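+      ! As an illustrative example (hypothetical indices): if this rank owns
+      ! global columns 0-3 and colmap = [7, 9], merge_pre_sorted gives
+      ! col_indices_off_proc_array = [0, 1, 2, 3, 7, 9], so the local and
+      ! off-process columns form one sorted set of global indices
+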
+      ! Have to get the max nnzs of the local and off-local rows we've just retrieved
+      max_nnzs = 0
+      do ifree = global_row_start, global_row_end_plus_one-1
+         call MatGetRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+         if (ncols > max_nnzs) max_nnzs = ncols
+         call MatRestoreRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+      end do
+      if (comm_size /= 1) then
+         do ifree = 1, cols_ao
+            call MatGetRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+            if (ncols > max_nnzs) max_nnzs = ncols
+            call MatRestoreRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+         end do
+      end if
+      ! and also the sparsity power
+      do ifree = global_row_start, global_row_end_plus_one-1
+         call MatGetRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+         if (ncols > max_nnzs) max_nnzs = ncols
+         call MatRestoreRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+      end do
+
+      ! ~~~~~~~~
+      ! Get pointers to the sequential aij structure so we don't have to put critical regions
+      ! around the matgetrow
+      ! ~~~~~~~~
+      call MatGetRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr)
+      if (.NOT. done) then
+         print *, "Pointers not set in call to MatGetRowIJ"
+         call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
+      end if
+      ! Returns the wrong size pointer and can break if that size goes negative??
+      !call MatSeqAIJGetArrayF90(reuse_submatrices(1),submatrices_vals,ierr);
+      A_array = reuse_submatrices(1)%v
+      ! Now we must never overwrite the values in this pointer, and we must
+      ! never call restore on it, see comment on top of the commented out
+      ! MatSeqAIJRestoreArray below
+      call MatSeqAIJGetArrayF90_mine(A_array, vals_c_ptr)
+      call c_f_pointer(vals_c_ptr, submatrices_vals, shape=[size(submatrices_ja)])
+
+      ! ~~~~~~~~~~
+
+      allocate(cols(max_nnzs))
+      allocate(vals(max_nnzs))
+      allocate(vals_power_temp(max_nnzs))
+      allocate(vals_previous_power_temp(max_nnzs))
+      allocate(temp(max_nnzs))
+      allocate(cols_index_one(max_nnzs))
+      allocate(cols_index_two(max_nnzs))
+
+      ! ~~~~~~~~~~~~
+      ! From here we now have cmat with the correct values up to the power poly_sparsity_order
+      ! and hence we want to add in the sparsity constrained powers
+      ! ~~~~~~~~~~~~
+
+      ! Now go through and compute the sum of the matrix powers
+      ! We're doing row-wise matmatmults here assuming the fixed sparsity
+      ! We exploit the fact that the subsequent matrix powers can be done
+      ! one row at a time, so we only have to retrieve the needed vals from mat_sparsity_match once
+      do i_loc = 1, local_rows
+
+         ! Get the row of mat_sparsity_match
+         call MatGetRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, &
+                  cols_ptr, vals_ptr, ierr)
+         ! Copying here because mat_sparsity_match and matrix are often the same matrix
+         ! and hence we can only have one active matgetrow
+         ncols = ncols_two
+         cols(1:ncols) = cols_ptr(1:ncols)
+         vals(1:ncols) = vals_ptr(1:ncols)
+         call MatRestoreRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, &
+                  cols_ptr, vals_ptr, ierr)
+         diag_index = -1
+         ! Find the diagonal index in this row
+         do j_loc = 1, ncols
+            if (cols(j_loc) == i_loc - 1 + global_row_start) then
+               diag_index = j_loc
+               exit
+            end if
+         end do
+
+         ! This is just a symbolic pass over the set of rows given in cols
+         ! 
Let's just do all the column matching and extraction of the values once + + ! Allocate some space to store the matching indices + allocate(symbolic_ones(ncols)) + allocate(symbolic_vals(ncols)) + row_index_into_submatrix = 1 + + ! This is a row-wise product + do j_loc = 1, ncols + + ! If we're trying to access a local row in matrix + if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + + call MatGetRow(matrix, cols(j_loc), ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + + ! If we're trying to access a non-local row in matrix + else + + ! this is local row index we want into reuse_submatrices(1) (as row_indices used to extract are just colmap) + ! We know cols is sorted, so every non-local index will be greater than the last one + ! (it's just that cols could have some local ones between different non-local) + ! colmap is also sorted and we know every single non-local entry in cols(j_loc) is in colmap + do while (row_index_into_submatrix .le. cols_ao .AND. colmap(row_index_into_submatrix) .lt. cols(j_loc)) + row_index_into_submatrix = row_index_into_submatrix + 1 + end do + + ! This is the number of columns + ncols_two = submatrices_ia(row_index_into_submatrix+1) - submatrices_ia(row_index_into_submatrix) + allocate(cols_two_ptr(ncols_two)) + ! This is the local column indices in reuse_submatrices(1) + cols_two_ptr = submatrices_ja(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + ! Because col_indices_off_proc_array (and hence the column indices in reuse_submatrices(1) is sorted, + ! then cols_two_ptr contains the sorted global column indices + cols_two_ptr = col_indices_off_proc_array(cols_two_ptr+1) + + ! This is the values + vals_two_ptr => & + submatrices_vals(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + end if + + ! Search for the matching column + ! We're intersecting the global column indices of mat_sparsity_match (cols) and matrix (cols_two_ptr) + call intersect_pre_sorted_indices_only(cols(1:ncols), cols_two_ptr, cols_index_one, cols_index_two, match_counter) + + ! Don't need to do anything if we have no matches + if (match_counter == 0) then + ! Store that we can skip this entry + symbolic_ones(j_loc)%ptr => null() + symbolic_vals(j_loc)%ptr => null() + else + + ! These are the matching local column indices for this row of mat_sparsity_match + allocate(symbolic_ones(j_loc)%ptr(match_counter)) + symbolic_ones(j_loc)%ptr = cols_index_one(1:match_counter) + + ! These are the matching values of matrix + allocate(symbolic_vals(j_loc)%ptr(match_counter)) + symbolic_vals(j_loc)%ptr = vals_two_ptr(cols_index_two(1:match_counter)) + end if + + ! Restore local row of matrix + if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + call MatRestoreRow(matrix, cols(j_loc), ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + else + deallocate(cols_two_ptr) + end if + end do + + ! Start with the values of mat_sparsity_match in it + vals_previous_power_temp(1:ncols) = vals(1:ncols) + + ! Loop over any matrix powers + ! vals_power_temp stores the prod for this row, and we update this as we go through + ! the term loop + term = poly_sparsity_order + 1 + skip_add = .FALSE. + ! If the fixed sparsity root is the second of a complex pair, we start one term earlier + ! so that we can compute the correct part of the fixed sparsity product, we just make sure not to add + ! 
anything to cmat as it is already correct up to the fixed sparsity order + if (coefficients(term,2) /= 0d0 .AND. .NOT. output_first_complex) then + term = term - 1 + skip_add = .TRUE. + end if + + ! This loop skips the last coefficient + do while (term .le. size(coefficients, 1) - 1) + + ! If real + if (coefficients(term,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(term,1)) < 1e-12) then + term = term + 1 + cycle + end if + + ! ~~~~~~~~~~~ + ! Now can add the value to our matrix + ! ~~~~~~~~~~~ + if (ncols /= 0 .AND. status_output(term, 1) /= 1) then + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) + end if + + ! Initialize with previous product before the A*prod subtraction + vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) + + ! This is the 1/theta_i * A * prod but where A * prod has fixed sparsity + do j_loc = 1, ncols + + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - & + 1d0/coefficients(term, 1) * & + symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) + end do + + term = term + 1 + + ! If complex + else + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (coefficients(term,1)**2 + coefficients(term,2)**2 < 1e-12) then + term = term + 2 + cycle + end if + + square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) + + ! If our fixed sparsity order falls on the first of a complex conjugate pair + if (.NOT. skip_add) then + + ! We skip the 2 * a * prod from the first root of a complex pair if that has already + ! been included in the cmat from build_gmres_polynomial_newton_inverse_full + if (status_output(term, 2) /= 1) then + temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + else + temp(1:ncols) = 0d0 + end if + + ! This is the -A * prod + do j_loc = 1, ncols + + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + temp(symbolic_ones(j_loc)%ptr) = temp(symbolic_ones(j_loc)%ptr) - & + symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) + end do + + ! This is the p = p + 1/(a^2 + b^2) * temp + if (ncols /= 0) then + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + square_sum * temp(1:ncols), ADD_VALUES, ierr) + end if + + ! Here we need to go back in and ensure 2 *a * prod is in temp if we skipped it + ! above. We know it is already in cmat, but it has to be in temp when we + ! do the next product + if (status_output(term, 2) == 1) then + if (output_first_complex) then + temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + end if + end if + + ! If our fixed sparsity order falls on the second of a complex conjugate pair + else + + ! In this case we have already included both 2*a*prod - A * prod into cmat + ! But we still have to compute the product for the next term + ! The problem here is that mat_sparsity_match has temp in it in this case, not + ! the old prod from whatever the previous loop is + ! 
+                  ! In that case build_gmres_polynomial_newton_inverse_full also outputs
+                  ! mat_product_save which is the old value of prod but with the sparsity of
+                  ! mat_sparsity_match (with zeros if needed)
+
+                  ! This case only occurs once for each row, so once we've hit this
+                  ! we will always have our correct prod
+                  skip_add = .FALSE.
+                  ! temp is output into mat_sparsity_match in this case
+                  temp(1:ncols) = vals_previous_power_temp(1:ncols)
+
+                  ! If sparsity order is 1, the previous product will have been the identity
+                  ! and we don't output it into mat_product_save because that is a trivial case
+                  ! we can do ourselves
+                  if (term == 1) then
+                     vals_previous_power_temp(1:ncols) = 0d0
+                     if (diag_index /= -1) then
+                        vals_previous_power_temp(diag_index) = 1d0
+                     end if
+
+                  ! In the case where mat_product_save is not the identity, we need to pull its value out
+                  else
+
+                     call MatGetRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, &
+                              cols_two_ptr, vals_two_ptr, ierr)
+
+                     ! We have guaranteed in the build_gmres_polynomial_newton_inverse_full
+                     ! version that mat_product_save has fixed sparsity
+                     vals_previous_power_temp(1:ncols_two) = vals_two_ptr(1:ncols_two)
+
+                     call MatRestoreRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, &
+                              cols_two_ptr, vals_two_ptr, ierr)
+                  end if
+               end if
+
+               ! Now we compute the next product
+               if (term .le. size(coefficients, 1) - 2) then
+
+                  vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols)
+
+                  ! This is prod = prod - 1/(a^2 + b^2) * A * temp
+                  do j_loc = 1, ncols
+
+                     ! If we have no matching columns cycle this row
+                     if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle
+
+                     ! symbolic_vals(j_loc)%ptr has the matching values of A in it
+                     vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - &
+                              square_sum * &
+                              symbolic_vals(j_loc)%ptr * temp(j_loc)
+                  end do
+               end if
+
+               term = term + 2
+
+            end if
+
+            ! This should now have the value of prod in it
+            vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols)
+         end do
+
+         ! Final step if last root is real
+         if (coefficients(size(coefficients, 1),2) == 0d0) then
+            if (ncols /= 0 .AND. abs(coefficients(size(coefficients, 1),1)) > 1e-12) then
+               call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, &
+                        1d0/coefficients(size(coefficients, 1), 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr)
+            end if
+         end if
+
+         ! ~~~~~~~~~~~~~~~
+
+         do j_loc = 1, ncols
+            if (associated(symbolic_ones(j_loc)%ptr)) then
+               deallocate(symbolic_ones(j_loc)%ptr)
+               deallocate(symbolic_vals(j_loc)%ptr)
+            end if
+         end do
+         deallocate(symbolic_vals, symbolic_ones)
+      end do
+
+      call MatRestoreRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr)
+      ! We very deliberately don't call restorearray here!
+      ! There is no matseqaijgetarrayread or matseqaijrestorearrayread in Fortran
+      ! The read-only routines don't increment the object state (PetscObjectStateGet) which
+      ! tells petsc the mat has changed. Hence above we directly access the data pointer with
+      ! a call to MatSeqAIJGetArrayF90_mine and then never write into it
+      ! If we call the restorearrayf90, that does increment the object state
+      ! even though we only read from the array
+      ! That would mean if we pass in a pc->pmat for example, just setting up a pc
+      ! would trigger petsc setting up the pc on every iteration of the pc
+      ! call MatSeqAIJRestoreArray(reuse_submatrices(1),submatrices_vals,ierr);
+
+      ! ~~~~~~~~~~~
+
+      ! 
Do the assembly, should need zero reductions in this given we've set the + ! flags above + call MatAssemblyBegin(cmat, MAT_FINAL_ASSEMBLY, ierr) + + ! Delete temporaries + call MatDestroy(mat_sparsity_match, ierr) + call MatDestroy(mat_product_save, ierr) + if (deallocate_submatrices) then + deallocate(reuse_submatrices) + reuse_submatrices => null() + end if + + deallocate(col_indices_off_proc_array) + deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, temp, cols_index_one, cols_index_two) + + ! Finish assembly + call MatAssemblyEnd(cmat, MAT_FINAL_ASSEMBLY, ierr) + + + end subroutine mat_mult_powers_share_sparsity_newton_cpu ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & coefficients, & + poly_sparsity_order, matrix_free, reuse_mat, reuse_submatrices, & inv_matrix) ! Builds a matrix which is an approximation to the inverse of a matrix using the @@ -684,7 +1647,10 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & type(tMat), intent(in) :: matrix integer, intent(in) :: poly_order PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients - type(tMat), intent(inout) :: inv_matrix + integer, intent(in) :: poly_sparsity_order + logical, intent(in) :: matrix_free + type(tMat), intent(inout) :: reuse_mat, inv_matrix + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices ! Local variables PetscInt :: global_rows, global_cols, local_rows, local_cols @@ -692,6 +1658,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() + logical :: reuse_triggered ! ~~~~~~ @@ -708,47 +1675,728 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! ~~~~~~~ ! Just build a matshell that applies our polynomial matrix-free ! ~~~~~~~ + if (matrix_free) then + + ! If not re-using + if (PetscObjectIsNull(inv_matrix)) then + + ! Have to dynamically allocate this + allocate(mat_ctx) + + ! We pass in the polynomial coefficients as the context + call MatCreateShell(MPI_COMM_MATRIX, local_rows, local_cols, global_rows, global_cols, & + mat_ctx, inv_matrix, ierr) + ! The subroutine petsc_matvec_gmres_newton_mf applies the polynomial inverse + call MatShellSetOperation(inv_matrix, & + MATOP_MULT, petsc_matvec_gmres_newton_mf, ierr) + + call MatAssemblyBegin(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) + call MatAssemblyEnd(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) + ! Have to make sure to set the type of vectors the shell creates + call ShellSetVecType(matrix, inv_matrix) + + ! Create temporary vectors we use during application + ! Make sure to use matrix here to get the right type (as the shell doesn't know about gpus) + call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_TEMP), PETSC_NULL_VEC, ierr) + call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_RHS), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) + + ! Reusing + else + call MatShellGetContext(inv_matrix, mat_ctx, ierr) + + end if + + mat_ctx%real_roots => coefficients(:, 1) + mat_ctx%imag_roots => coefficients(:, 2) + ! Now because the context reset deallocates the coefficient pointer + ! we want to make sure we don't leak memory, so we use pointer remapping here + ! to turn the 2D coefficient pointer into a 1D that we can store in mat_ctx%coefficients + ! 
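The pointer remapping used for mat_ctx%coefficients above relies on F2008 rank-changing pointer bounds remapping over a contiguous target: a 2D allocated pointer is viewed through a 1D pointer, so one deallocate on the 1D view releases the whole block. A minimal standalone illustration of the technique, with hypothetical names rather than the PFLARE context types:

program pointer_remap_demo
   implicit none
   real(8), dimension(:, :), pointer, contiguous :: coeffs_2d
   real(8), dimension(:), pointer :: coeffs_1d

   allocate(coeffs_2d(4, 2))
   coeffs_2d(:, 1) = [1d0, 2d0, 3d0, 4d0]   ! real parts, say
   coeffs_2d(:, 2) = [0d0, 0d0, 5d0, -5d0]  ! imaginary parts

   ! Rank-changing pointer bounds remapping: requires a contiguous target
   coeffs_1d(1:2*size(coeffs_2d, 1)) => coeffs_2d(:, :)
   print *, coeffs_1d   ! column-major: all real parts, then all imaginary parts

   ! Deallocating through the 1D view frees the storage allocated via the 2D pointer,
   ! which is the behaviour the context reset above depends on
   deallocate(coeffs_1d)
   nullify(coeffs_2d)
end program pointer_remap_demo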
and then the deallocate on mat_ctx%coefficients should still delete all the memory
+         mat_ctx%coefficients(1:2*size(coefficients,1)) => coefficients(:, :)
+         ! This is the matrix whose inverse we are applying (just copying the pointer here)
+         mat_ctx%mat = matrix
+
+         ! We're done
+         return
+      endif
+
+      reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix)
+
+      ! ~~~~~~~~~~~~
+      ! If we're here then we want an assembled approximate inverse
+      ! ~~~~~~~~~~~~
+
+      ! If we're a zeroth order poly this is trivial as it's just 1/theta_1 I
+      if (poly_order == 0) then
+
+         call build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, coefficients, &
+                  inv_matrix)
+
+         ! Then just return
+         return
+
+      ! For poly_order 1 and poly_sparsity_order 1 this is easy
+      else if (poly_order == 1 .AND. poly_sparsity_order == 1) then
+
+         ! Duplicate & copy the matrix, but ensure there is a diagonal present
+         call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, inv_matrix)
+
+         call build_gmres_polynomial_newton_inverse_1st_1st(matrix, &
+                  poly_order, coefficients, inv_matrix)
+
+         ! Then just return
+         return
+
+      end if
+
+      ! If we're constraining sparsity we've built a custom matrix-powers that assumes fixed sparsity
+      if (poly_sparsity_order < poly_order) then
+
+         ! This routine is a custom one that builds our matrix powers and assumes fixed sparsity
+         ! so that it doesn't have to do many comms
+         call mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsity_order, coefficients, &
+                  reuse_mat, reuse_submatrices, inv_matrix)
+
+         ! Then just return
+         return
+
+      end if
+
+      ! ~~~~~~~~~~
+      ! We are only here if we don't constrain_sparsity
+      ! ~~~~~~~~~~
+      call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, &
+               inv_matrix)
+
+
+   end subroutine build_gmres_polynomial_newton_inverse
+
+! -------------------------------------------------------------------------------------------------------------------------------
+
+   subroutine build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, coefficients, &
+                  inv_matrix)
+
+      ! Specific 0th order inverse
+
+      ! ~~~~~~
+      type(tMat), intent(in) :: matrix
+      integer, intent(in) :: poly_order
+      PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients
+      type(tMat), intent(inout) :: inv_matrix
+
+      ! Local variables
+      integer :: errorcode
+      PetscErrorCode :: ierr
+      logical :: reuse_triggered
+      type(tVec) :: diag_vec
+
+      ! ~~~~~~
+
+      if (poly_order /= 0) then
+         print *, "This is a 0th order inverse, but poly_order is not 0"
+         call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
+      end if
+
+      ! Let's create a matrix to represent the inverse diagonal
+      reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix)
+
+      if (.NOT. reuse_triggered) then
+         call MatCreateVecs(matrix, PETSC_NULL_VEC, diag_vec, ierr)
+      else
+         call MatDiagonalGetDiagonal(inv_matrix, diag_vec, ierr)
+      end if
+
+      ! Must be real as we only have one coefficient
+      call VecSet(diag_vec, 1d0/coefficients(1, 1), ierr)
+
+      ! We may be reusing with the same sparsity
+      if (.NOT. reuse_triggered) then
+         ! The matrix takes ownership of diag_vec and increases ref counter
+         call MatCreateDiagonal(diag_vec, inv_matrix, ierr)
+         call VecDestroy(diag_vec, ierr)
+      else
+         call MatDiagonalRestoreDiagonal(inv_matrix, diag_vec, ierr)
+      end if
+
+   end subroutine build_gmres_polynomial_newton_inverse_0th_order
+
+
+!
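All of the assembled variants dispatched above, and the matrix-free apply, evaluate the same Newton-form recurrence: each real root theta contributes prod/theta, and complex conjugate pairs a +- ib are folded together so only real arithmetic is needed. A scalar standalone check of that recurrence against the residual-polynomial identity p(z) = (1 - prod_i(1 - z/theta_i))/z; the root values here are hypothetical, and the matrix routines additionally special-case the final real root to save one product, which a scalar version does not need.

program newton_recurrence_check
   implicit none
   integer, parameter :: nroots = 4
   ! Hypothetical roots: one real, a complex conjugate pair, one real
   real(8), parameter :: re(nroots) = [2d0, 1d0, 1d0, 3d0]
   real(8), parameter :: im(nroots) = [0d0, 0.5d0, -0.5d0, 0d0]
   real(8) :: z, p, prod, tmp, square_sum
   complex(8) :: r
   integer :: i

   z = 0.7d0
   p = 0d0
   prod = 1d0
   i = 1
   do while (i <= nroots)
      if (im(i) == 0d0) then
         ! Real root: p = p + prod/theta, then prod = prod*(1 - z/theta)
         p = p + prod/re(i)
         prod = prod*(1d0 - z/re(i))
         i = i + 1
      else
         ! Complex pair a +- ib handled together, cf. the complex branches
         ! in the assembled routines: tmp = (2a - z)*prod/(a^2 + b^2)
         square_sum = re(i)**2 + im(i)**2
         tmp = (2d0*re(i) - z)*prod/square_sum
         p = p + tmp
         prod = prod - z*tmp
         i = i + 2
      end if
   end do

   ! Reference: p(z) = (1 - r(z))/z with r(z) = prod_i (1 - z/theta_i)
   r = (1d0, 0d0)
   do i = 1, nroots
      r = r*(1d0 - z/cmplx(re(i), im(i), kind=8))
   end do
   print *, "recurrence:", p, " identity:", real((1d0 - r)/z)
end program newton_recurrence_check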
------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly_order, coefficients, & + inv_matrix) + + ! Specific inverse with 0th order sparsity + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + + ! Local variables + integer :: i + PetscErrorCode :: ierr + logical :: reuse_triggered + type(tVec) :: inv_vec, diag_vec, product_vec, temp_vec_A, one_vec, temp_vec_two + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + + ! Our matrix has to be square + call MatCreateVecs(matrix, product_vec, diag_vec, ierr) + call MatGetDiagonal(matrix, diag_vec, ierr) + + if (.NOT. reuse_triggered) then + call VecDuplicate(diag_vec, inv_vec, ierr) + else + call MatDiagonalGetDiagonal(inv_matrix, inv_vec, ierr) + end if + call VecDuplicate(diag_vec, temp_vec_A, ierr) + call VecDuplicate(diag_vec, one_vec, ierr) + call VecDuplicate(diag_vec, temp_vec_two, ierr) + + ! Set to zero as we add to it + call VecSet(inv_vec, 0d0, ierr) + ! We start with an identity in product_vec + call VecSet(product_vec, 1d0, ierr) + call VecSet(one_vec, 1d0, ierr) + + i = 1 + do while (i .le. size(coefficients, 1) - 1) + + ! temp_vec_A is going to store things with the sparsity of A + call VecCopy(diag_vec, temp_vec_A, ierr) + + ! If real this is easy + if (coefficients(i,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(i,1)) < 1e-12) then + i = i + 1 + cycle + end if + + call VecAXPY(inv_vec, 1d0/coefficients(i,1), product_vec, ierr) + + ! temp_vec_A = A_ff/theta_k + call VecScale(temp_vec_A, -1d0/coefficients(i,1), ierr) + ! temp_vec_A = I - A_ff/theta_k + call VecAXPY(temp_vec_A, 1d0, one_vec, ierr) + + ! product_vec = product_vec * temp_vec_A + call VecPointwiseMult(product_vec, product_vec, temp_vec_A, ierr) + + i = i + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then + i = i + 2 + cycle + end if + + ! Compute 2a I - A + ! temp_vec_A = -A + call VecScale(temp_vec_A, -1d0, ierr) + ! temp_vec_A = 2a I - A_ff + call VecAXPY(temp_vec_A, 2d0 * coefficients(i,1), one_vec, ierr) + ! temp_vec_A = (2a I - A_ff)/(a^2 + b^2) + call VecScale(temp_vec_A, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) + + ! temp_vec_two = temp_vec_A * product_vec + call VecPointwiseMult(temp_vec_two, temp_vec_A, product_vec, ierr) + call VecAXPY(inv_vec, 1d0, temp_vec_two, ierr) + + if (i .le. size(coefficients, 1) - 2) then + ! temp_vec_two = A * temp_vec_two + call VecPointwiseMult(temp_vec_two, diag_vec, temp_vec_two, ierr) + call VecAXPY(product_vec, -1d0, temp_vec_two, ierr) + end if + + ! Skip two evals + i = i + 2 + + end if + end do + + ! Final step if last root is real + if (coefficients(size(coefficients,1),2) == 0d0) then + ! Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(size(coefficients, 1),1)) > 1e-12) then + call VecAXPY(inv_vec, 1d0/coefficients(size(coefficients, 1),1), product_vec, ierr) + end if + end if + + ! We may be reusing with the same sparsity + if (.NOT. reuse_triggered) then + ! 
The matrix takes ownership of inv_vec and increases ref counter + call MatCreateDiagonal(inv_vec, inv_matrix, ierr) + call VecDestroy(inv_vec, ierr) + else + call MatDiagonalRestoreDiagonal(inv_matrix, inv_vec, ierr) + end if + + call VecDestroy(diag_vec, ierr) + call VecDestroy(product_vec, ierr) + call VecDestroy(temp_vec_A, ierr) + call VecDestroy(one_vec, ierr) + call VecDestroy(temp_vec_two, ierr) + + end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coefficients, & + inv_matrix, mat_prod_or_temp, status_output) + + ! Specific 1st order with 1st order sparsity + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + type(tMat), intent(inout), optional :: mat_prod_or_temp + integer, dimension(:, :), intent(inout), optional :: status_output + + ! Local variables + PetscErrorCode :: ierr + logical :: reuse_triggered, output_product + PetscReal :: square_sum + + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + output_product = present(mat_prod_or_temp) - ! If not re-using - if (PetscObjectIsNull(inv_matrix)) then + ! Flags to prevent reductions when assembling (there are assembles in the shift) + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - ! Have to dynamically allocate this - allocate(mat_ctx) + if (output_product) then + status_output = 0 + end if - ! We pass in the polynomial coefficients as the context - call MatCreateShell(MPI_COMM_MATRIX, local_rows, local_cols, global_rows, global_cols, & - mat_ctx, inv_matrix, ierr) - ! The subroutine petsc_matvec_gmres_newton_mf applies the polynomial inverse - call MatShellSetOperation(inv_matrix, & - MATOP_MULT, petsc_matvec_gmres_newton_mf, ierr) + ! We only have two coefficients, so they are either both real or complex conjugates + ! If real + if (coefficients(1,2) == 0d0) then + + ! Have to be careful here, as we may be first order, but the second eigenvaule + ! might have been set to zero thanks to the rank reducing solve + ! So we just check if the second real part is zero and if it is + ! we just compute a 0th order inverse - annoyingly we can't call + ! build_gmres_polynomial_newton_inverse_0th_order as that builds a MATDIAGONAL + ! and in the tests there is a problem where we reuse the sparsity, in the first + ! solve we don't have a zero coefficient but in the second solve we do + ! So the mat type needs to remain consistent + ! This can't happen in the complex case + if (abs(coefficients(2,1)) < 1e-12) then + + ! Set to zero + call MatScale(inv_matrix, 0d0, ierr) + + ! Tricky case here as we want to pass out the identity with the + ! sparsity of A + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + call MatShift(mat_prod_or_temp, 1d0, ierr) + status_output(1:2, 1) = 1 + end if + + ! Then add in the 0th order inverse + call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) + + ! 
Then just return + return + end if - call MatAssemblyBegin(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) - call MatAssemblyEnd(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) - ! Have to make sure to set the type of vectors the shell creates - call ShellSetVecType(matrix, inv_matrix) + ! Could just compute the equivalent mononomial here to save some flops + ! but the whole point to doing the Newton form is to avoid the + ! theta_1 * theta_2 that would result + + ! result = -A_ff/theta_1 + call MatScale(inv_matrix, -1d0/(coefficients(1, 1)), ierr) + ! result = I -A_ff/theta_1 + call MatShift(inv_matrix, 1d0, ierr) + ! If we're doing this as part of fixed sparsity multiply, + ! we need to return mat_prod_or_temp + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + ! result = 1/theta_2 * (I -A_ff/theta_1) + call MatScale(inv_matrix, 1d0/(coefficients(2, 1)), ierr) + + ! result = 1/theta_1 + 1/theta_2 * (I -A_ff/theta_1) + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) - ! Create temporary vectors we use during application - ! Make sure to use matrix here to get the right type (as the shell doesn't know about gpus) - call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_TEMP), PETSC_NULL_VEC, ierr) - call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_RHS), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) + if (output_product) then + status_output(1:2, 1) = 1 + end if - ! Reusing + ! Complex conjugate roots, a +- ib else - call MatShellGetContext(inv_matrix, mat_ctx, ierr) + ! a^2 + b^2 + square_sum = coefficients(1,1)**2 + coefficients(1,2)**2 + + ! Complex conjugate roots + ! result = -A_ff + call MatScale(inv_matrix, -1d0, ierr) + ! result = 2a I - A_ff + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 2d0 * coefficients(1,1), ierr) + ! If we're doing this as part of fixed sparsity multiply, + ! we need to return mat_prod_or_temp + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + ! result = 2a I - A_ff/(a^2 + b^2) + call MatScale(inv_matrix, 1d0/square_sum, ierr) + + if (output_product) then + status_output(1:2, 2) = 1 + end if + end if + + end subroutine build_gmres_polynomial_newton_inverse_1st_1st + + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + inv_matrix, mat_prod_or_temp, poly_sparsity_order, output_first_complex, & + status_output, mat_product_save) + + ! No constrained sparsity by default + ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex, status_output and + ! mat_product_save + ! then it will build part of the terms, up to poly_sparsity_order, and return the product + ! in mat_prod_or_temp and mat_product_save that you need to compute the rest of the fixed sparsity terms + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + type(tMat), intent(inout), optional :: mat_prod_or_temp, mat_product_save + integer, intent(in), optional :: poly_sparsity_order + logical, intent(inout), optional :: output_first_complex + integer, dimension(:, :), intent(inout), optional :: status_output + + ! 
Local variables + PetscErrorCode :: ierr + logical :: reuse_triggered, output_product, first_complex + integer :: i, i_sparse, sparsity_order + type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 + PetscReal :: square_sum, a_coeff + + ! ~~~~~~ + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + output_product = present(mat_prod_or_temp) + + if (.NOT. reuse_triggered) then + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, .FALSE., inv_matrix) + end if + + ! Set to zero as we add in each product of terms + call MatScale(inv_matrix, 0d0, ierr) + + ! Don't set any off processor entries so no need for a reduction when assembling + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + + ! We start with an identity in mat_product + call generate_identity(matrix, mat_product) + if (output_product) then + status_output = 0 end if + sparsity_order = poly_order + if (present(poly_sparsity_order)) then + sparsity_order = poly_sparsity_order + end if + + ! If we're going to output the product as part of a fixed sparsity multiply, + ! we may be asking to constrain the sparsity to a power in between order and order + 2 + ! if there is a complex root at poly_sparsity_order + ! ie if we have roots (theta_1^r, theta_2^c, theta_3^c, theta_4^r) + ! where ^r means a purely real root and ^c means a complex root + ! want poly_sparsity_order = 1, we can't process all the way up to theta_3^c as that would + ! compute up to an A^2 term which is beyond our sparsity constraint + + ! i_sparse tells us how many roots we are going to process + ! Normally this would just be size(coefficients, 1) and the loop below goes up + ! to size(coefficients, 1) - 1. The last real root gets its final term added outside the loop + ! and if the last root is complex then we only have to hit the first of the pair in the loop + ! + ! If we have fixed sparsity: + ! + ! if the fixed sparsity root is real then we want to set i_sparse to poly_sparsity_order+1 + ! so we hit the roots up to poly_sparsity_order in the loop and then we take care of the + ! poly_sparsity_order + 1 root outside the loop + ! + ! if the fixed sparsity root is complex but poly_sparsity_order + 1 hits the second of the pair + ! then we only need to set i_sparse to poly_sparsity_order + 1 so we only hit the first + ! pair in the loop below + ! + ! if the fixed sparsity root is complex but poly_sparsity_order + 1 hits the first of the pair + ! then we need to set i_sparse to poly_sparsity_order + 2 + ! otherwise we would never hit the first pair + + i_sparse = size(coefficients, 1) + first_complex = .FALSE. + + if (output_product) then + + output_first_complex = .FALSE. + if (output_product) then + i_sparse = sparsity_order + 1 + + ! If the last root is real we don't have to do anything + if (coefficients(i_sparse,2) /= 0d0) then + + ! If the one before is real, then we know we're on the first + if (coefficients(i_sparse-1,2) == 0d0) then + output_first_complex = .TRUE. + ! See discussion above + i_sparse = i_sparse + 1 + + ! If the one before is complex + else + + ! Check if the distance between the fixed sparsity root and the one before + ! If > zero then they are not complex conjugates and hence we are on the first of the pair + if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .OR. 
& + abs(coefficients(i_sparse,2) + coefficients(i_sparse-1,2))/coefficients(i_sparse,2) > 1e-14) then + output_first_complex = .TRUE. + i_sparse = i_sparse + 1 + end if + end if + end if + end if + first_complex = output_first_complex + end if + + ! ~~~~~~~~~~~~ + ! Iterate over the i + ! This is basically the same as the MF application but we have to build the powers + ! ~~~~~~~~~~~~ + i = 1 + ! Loop through to one fewer than the number of roots + ! We're always building up the next product + do while (i .le. i_sparse - 1) + + ! Duplicate & copy the matrix, but ensure there is a diagonal present + ! temp_mat_A is going to store things with the sparsity of A + if (PetscObjectIsNull(temp_mat_A)) then + call mat_duplicate_copy_plus_diag(matrix, .FALSE., temp_mat_A) + else + ! Can reuse the sparsity + call mat_duplicate_copy_plus_diag(matrix, .TRUE., temp_mat_A) + end if + + ! If real this is easy + if (coefficients(i,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero + ! We still compute the entries as as zero because we need the sparsity + ! to be correct for the next iteration + if (abs(coefficients(i,1)) < 1e-12) then + square_sum = 0 + else + square_sum = 1d0/coefficients(i,1) + end if + + ! Then add the scaled version of each product + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can do it directly + call MatShift(inv_matrix, 1d0/coefficients(i,1), ierr) + else + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, square_sum, mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, square_sum, mat_product) + end if + end if + if (output_product) status_output(i, 1) = 1 + + ! temp_mat_A = A_ff/theta_k + call MatScale(temp_mat_A, -square_sum, ierr) + ! temp_mat_A = I - A_ff/theta_k + call MatShift(temp_mat_A, 1d0, ierr) + + ! mat_product_k_plus_1 = mat_product * temp_mat_A + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can just copy + call MatDestroy(mat_product, ierr) + call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, mat_product, ierr) + else + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, mat_product_k_plus_1, ierr) + call MatDestroy(mat_product, ierr) + mat_product = mat_product_k_plus_1 + end if + + ! We copy out the last product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i == i_sparse - 1) then + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + i = i + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then + square_sum = 0 + a_coeff = 0 + else + square_sum = 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2) + a_coeff = 2d0 * coefficients(i,1) + end if + + ! If our fixed sparsity root is the first of a complex conjugate pair + ! We want to pass out mat_product and only add that to inv_matrix + ! This is equivalent to only part of tmp on Line 9 of Loe + ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) + ! as this is the part that would increase the sparsity beyond poly_sparsity_order + if (i == sparsity_order + 1 .AND. first_complex) then + + ! Copy mat_product into temp_mat_two + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + + ! 
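The i_sparse / output_first_complex selection made earlier in this routine can be read in isolation: given roots stored with conjugate pairs adjacent, and a fixed sparsity order, decide whether the root at poly_sparsity_order + 1 is real, the first of a conjugate pair, or the second. A condensed standalone sketch of just that decision follows; the roots are hypothetical and the tolerances are copied from the routine above.

program i_sparse_demo
   implicit none
   ! Hypothetical roots: real, complex conjugate pair, real
   real(8) :: re(4) = [2d0, 1d0, 1d0, 3d0]
   real(8) :: im(4) = [0d0, 0.5d0, -0.5d0, 0d0]
   integer :: i_sparse
   logical :: first_complex

   ! Sparsity order 1 lands on the first of the pair, so one extra root is processed
   call pick_i_sparse(re, im, 1, i_sparse, first_complex)
   print *, "i_sparse =", i_sparse, " first_complex =", first_complex   ! expect 3, T

contains

   subroutine pick_i_sparse(re, im, sparsity_order, i_sparse, first_complex)
      real(8), intent(in) :: re(:), im(:)
      integer, intent(in) :: sparsity_order
      integer, intent(out) :: i_sparse
      logical, intent(out) :: first_complex

      first_complex = .FALSE.
      i_sparse = sparsity_order + 1
      ! If the root at the fixed sparsity order is real there is nothing to do
      if (im(i_sparse) /= 0d0) then
         if (im(i_sparse - 1) == 0d0) then
            ! Previous root is real, so this must be the first of a conjugate pair
            first_complex = .TRUE.
            i_sparse = i_sparse + 1
         else if (abs(re(i_sparse) - re(i_sparse - 1))/re(i_sparse) > 1d-14 .OR. &
                  abs(im(i_sparse) + im(i_sparse - 1))/im(i_sparse) > 1d-14) then
            ! Previous root is complex but not our conjugate: again first of a pair
            first_complex = .TRUE.
            i_sparse = i_sparse + 1
         end if
      end if
   end subroutine pick_i_sparse

end program i_sparse_demo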
temp_mat_two = 2a * mat_product + call MatScale(temp_mat_two, a_coeff, ierr) + if (output_product) status_output(i, 2) = 1 + + ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + ! Just do the normal loop + else + + ! temp_mat_A = -A + call MatScale(temp_mat_A, -1d0, ierr) + ! temp_mat_A = 2a I - A_ff + call MatShift(temp_mat_A, a_coeff, ierr) + if (output_product) then + status_output(i, 2) = 1 + status_output(i+1, 2) = 1 + end if + + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can do it directly + call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + else + ! temp_mat_two = temp_mat_A * mat_product + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) + end if + + ! We copy out the last part of the old product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + ! If i == 1 then we know mat_product is the identity and we don't bother + ! to write it out, we just have some custom code in the product given its trivial + if (i /= 1) then + ! This ensures it has the matching sparsity + call MatDestroy(mat_product_save, ierr) + call MatConvert(mat_prod_or_temp, MATSAME, MAT_INITIAL_MATRIX, mat_product_save, ierr) + ! This zeros mat_product_save and then puts mat_product into the sparsity pattern + ! of mat_prod_or_temp + call MatCopy(mat_product, mat_product_save, DIFFERENT_NONZERO_PATTERN, ierr) + end if + end if + end if + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, square_sum, & + temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, square_sum, temp_mat_two) + end if - mat_ctx%real_roots => coefficients(:, 1) - mat_ctx%imag_roots => coefficients(:, 2) - ! Now because the context reset deallocates the coefficient pointer - ! we want to make sure we don't leak memory, so we use pointer remapping here - ! to turn the 2D coefficient pointer into a 1D that we can store in mat_ctx%coefficients - ! and then the deallocate on mat_ctx%coefficients should still delete all the memory - mat_ctx%coefficients(1:2*size(coefficients,1)) => coefficients(:, :) - ! This is the matrix whose inverse we are applying (just copying the pointer here) - mat_ctx%mat = matrix - - end subroutine build_gmres_polynomial_newton_inverse + if (i .le. i_sparse - 2) then + + ! temp_mat_three = matrix * temp_mat_two + call MatMatMult(matrix, temp_mat_two, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) + call MatDestroy(temp_mat_two, ierr) + if (output_product) then + status_output(i, 2) = 1 + end if + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(mat_product, -square_sum, & + temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(mat_product, -square_sum, temp_mat_three) + end if + + ! 
We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. .NOT. first_complex) then + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + call MatDestroy(temp_mat_three, ierr) + else + call MatDestroy(temp_mat_two, ierr) + end if + + ! Skip two evals + i = i + 2 + + end if + end do + + ! Final step if last root is real + if (.NOT. first_complex) then + if (coefficients(i_sparse,2) == 0d0) then + ! Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(i_sparse,1)) > 1e-12) then + + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0/coefficients(i_sparse,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i_sparse,1), mat_product) + end if + if (output_product) status_output(i_sparse, 1) = 1 + end if + end if + end if + + call MatDestroy(temp_mat_A, ierr) + call MatDestroy(mat_product, ierr) + + end subroutine build_gmres_polynomial_newton_inverse_full ! ------------------------------------------------------------------------------------------------------------------------------- diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index 071e646c..a1279ee2 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -67,14 +67,18 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in Mat *submatrices; // Pull out the nonlocal parts of the input mat we need + const PetscInt *colmap_input_mat; + PetscInt cols_ao_input = 0; if (mpi) { - PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local_input, &mat_nonlocal_input, NULL)); + PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local_input, &mat_nonlocal_input, &colmap_input_mat)); PetscCallVoid(MatMPIAIJGetSeqAIJ(*mat_sparsity_match, &mat_local_sparsity, &mat_nonlocal_sparsity, &colmap_mat_sparsity_match)); PetscCallVoid(MatGetSize(mat_nonlocal_sparsity, &rows_ao, &cols_ao)); PetscCallVoid(MatGetSize(mat_local_sparsity, &rows_ad, &cols_ad)); - + PetscInt rows_ao_input; + PetscCallVoid(MatGetSize(mat_nonlocal_input, &rows_ao_input, &cols_ao_input)); + // We need to pull out all the columns in the sparsity mat // and the nonlocal rows that correspond to the nonlocal columns // from the input mat @@ -175,6 +179,71 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, &device_local_vals_output, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_output, &device_nonlocal_j_output, &device_nonlocal_vals_output, &mtype)); + // ~~~~~~~~~~~~~~ + // Build a mapping from the input matrix's nonlocal column indices to the + // sparsity matrix's column space ("local" submat column space), which is defined as: + // [0..cols_ad-1] for local columns, [cols_ad..cols_ad+cols_ao-1] for sparsity colmap columns + // + // When doing the matrix-matrix product: + // 1. We need to compare local cols from local rows + // We need to access the local input matrix and we + // can do that directly given local indices are the same + // + // 2. 
We need to compare nonlocal cols from non-local rows + // We need to access submat for this which now only has the non-local rows in it + // Those will have a "local" column index that matches col_indices_off_proc_array given + // we create it with MatCreateSubMatrices + // + // 3. We need to compare nonlocal cols from local rows + // We need to access the input_matrix for this + // But (for higher order fixed sparsity) the colmap of the input matrix is not the same + // as the colmap of the sparsity matrix + // So below we create a mapping that converts from the input matrix's nonlocal column indices + // to the "local" column indices of the submat (which correspond to the sparsity matrix's column space) for the nonlocal columns + // If there are not matching entries in the sparsity colmap, we use a large sentinel value that will never + // match any col_orig and preserves sorted order. + // + // This mapping is needed because the input matrix and sparsity matrix may have + // different colmaps when poly_sparsity_order >= 2. + // ~~~~~~~~~~~~~~ + + // Use a sentinel larger than any valid column index + const PetscInt COLMAP_NOT_FOUND = cols_ad + cols_ao + 1; + + auto input_nonlocal_to_submat_col_d = PetscIntKokkosView("input_nonlocal_to_submat_col_d", mpi ? cols_ao_input : 1); + if (mpi && cols_ao_input > 0) + { + // Build the mapping on the host + // Both colmaps are sorted, so we can do a merge-style scan + auto input_nonlocal_to_submat_col_h = Kokkos::create_mirror_view(input_nonlocal_to_submat_col_d); + PetscInt sparsity_colmap_idx = 0; + for (PetscInt k = 0; k < cols_ao_input; k++) + { + PetscInt global_col = colmap_input_mat[k]; + // Advance the sparsity colmap index (both are sorted) + while (sparsity_colmap_idx < cols_ao && colmap_mat_sparsity_match[sparsity_colmap_idx] < global_col) + { + sparsity_colmap_idx++; + } + if (sparsity_colmap_idx < cols_ao && colmap_mat_sparsity_match[sparsity_colmap_idx] == global_col) + { + input_nonlocal_to_submat_col_h(k) = cols_ad + sparsity_colmap_idx; + } + else + { + // Not found — use sentinel value that preserves sort order + // Since colmap_input is sorted and colmap_sparsity is sorted, + // if an entry is missing it's between two found entries, + // so we assign a value that maintains monotonicity + input_nonlocal_to_submat_col_h(k) = COLMAP_NOT_FOUND; + } + } + Kokkos::deep_copy(input_nonlocal_to_submat_col_d, input_nonlocal_to_submat_col_h); + // Log copy with petsc + bytes = input_nonlocal_to_submat_col_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + } + // ~~~~~~~~~~~~~~ // Find maximum non-zeros per row for sizing scratch memory // ~~~~~~~~~~~~~~ @@ -358,8 +427,11 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in while (idx_col_of_row_i < ncols_row_i && idx_col_of_row_j < ncols_row_of_col_j) { // The col_target is the column we are trying to match in the row of column j - // We always convert it to the "local" indexing as if it were in the columns of the submat, ie - // the column indexing of [local cols; local cols + 0:cols_ao-1] + // We convert everything to the submat "local" column space for comparison, ie + // the column indexing of [0..cols_ad-1 for local cols; cols_ad+k for sparsity colmap[k]] + // When the input matrix and sparsity matrix have different colmaps + // (poly_sparsity_order >= 2), we use the input_nonlocal_to_submat_col_d mapping + // to convert the input matrix's nonlocal column indices to the sparsity colmap space PetscInt col_target; if 
(row_of_col_j_local)
                  {
@@ -369,8 +441,11 @@
                  }
                  else
                  {
-                     // Convert to "local" column index of submat by adding cols_ad
-                     col_target = device_nonlocal_j_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j] + cols_ad;
+                     // This is the case where we need to access non-local columns in local rows of input_matrix
+                     // and hence we need our mapping
+                     // Convert the nonlocal column index from the input matrix's colmap space
+                     // to the "local" column index of submat
+                     col_target = input_nonlocal_to_submat_col_d(device_nonlocal_j_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j]);
                  }
               }
               else
@@ -390,40 +465,45 @@
                  // Convert to "local" column index of submat by adding cols_ad
                  col_orig = device_nonlocal_j_sparsity[device_nonlocal_i_sparsity[i] + (idx_col_of_row_i - local_cols_row_i)] + cols_ad;
               }
-
-              if (col_orig < col_target) {
-                 // Original column is smaller, move to next original column
-                 idx_col_of_row_i++;
-              } else if (col_orig > col_target) {
-                 // Target column is smaller, move to next target column
+
+              // Skip entries where the input column doesn't exist in the sparsity pattern
+              if (col_target == COLMAP_NOT_FOUND) {
                  idx_col_of_row_j++;
-                 // We've found a matching index and hence we can do our compute
              } else {
-
-                 PetscReal val_target;
-                 if (row_of_col_j_local)
-                 {
-                    if (idx_col_of_row_j < local_cols_row_of_col_j)
+                 if (col_orig < col_target) {
+                    // Original column is smaller, move to next original column
+                    idx_col_of_row_i++;
+                 } else if (col_orig > col_target) {
+                    // Target column is smaller, move to next target column
+                    idx_col_of_row_j++;
+                 // We've found a matching index and hence we can do our compute
+                 } else {
+
+                    PetscReal val_target;
+                    if (row_of_col_j_local)
                     {
-                       val_target = device_local_vals_input[device_local_i_input[row_of_col_j] + idx_col_of_row_j];
+                       if (idx_col_of_row_j < local_cols_row_of_col_j)
+                       {
+                          val_target = device_local_vals_input[device_local_i_input[row_of_col_j] + idx_col_of_row_j];
+                       }
+                       else
+                       {
+                          val_target = device_nonlocal_vals_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j];
+                       }
                     }
                     else
                     {
-                       val_target = device_nonlocal_vals_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j];
+                       val_target = device_submat_vals[device_submat_i[row_of_col_j] + idx_col_of_row_j];
                     }
-                 }
-                 else
-                 {
-                    val_target = device_submat_vals[device_submat_i[row_of_col_j] + idx_col_of_row_j];
-                 }
-                 // Has to be atomic!
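The host-side mapping built earlier in this function is a merge-style scan over two sorted colmaps, with a sentinel for columns that are missing from the sparsity pattern; that sentinel is what makes the COLMAP_NOT_FOUND skip above safe. A standalone sketch of that scan, with hypothetical colmaps and 1-based indexing where the C++ above is 0-based:

program colmap_merge_demo
   implicit none
   integer, parameter :: cols_ad = 10
   integer, parameter :: colmap_input(5) = [3, 7, 12, 20, 31]
   integer, parameter :: colmap_sparsity(6) = [3, 7, 9, 20, 25, 31]
   integer :: mapping(5), not_found, k, idx

   ! Sentinel larger than any valid translated index, so sorted order is preserved
   not_found = cols_ad + size(colmap_sparsity) + 1

   idx = 1
   do k = 1, size(colmap_input)
      ! Both colmaps are sorted, so the sparsity index only ever advances
      do while (idx <= size(colmap_sparsity) .AND. colmap_sparsity(idx) < colmap_input(k))
         idx = idx + 1
      end do
      if (idx <= size(colmap_sparsity) .AND. colmap_sparsity(idx) == colmap_input(k)) then
         mapping(k) = cols_ad + idx - 1   ! cols_ad + 0-based offset in the C++ code
      else
         mapping(k) = not_found           ! e.g. column 12 is not in the sparsity colmap
      end if
   end do
   print *, mapping
end program colmap_merge_demo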
Potentially lots of contention so maybe not + // the most performant way to do this + Kokkos::atomic_add(&vals_temp[idx_col_of_row_i], vals_prev[j] * val_target); - // Move forward in both arrays - idx_col_of_row_i++; - idx_col_of_row_j++; + // Move forward in both arrays + idx_col_of_row_i++; + idx_col_of_row_j++; + } } } }); diff --git a/src/PCPFLAREINV.c b/src/PCPFLAREINV.c index b7f3a8e3..8b6826bd 100644 --- a/src/PCPFLAREINV.c +++ b/src/PCPFLAREINV.c @@ -18,8 +18,8 @@ PETSC_EXTERN void calculate_and_build_approximate_inverse_c(Mat *input_mat, Pets // // PFLAREINV_POWER - GMRES polynomial with the power basis // PFLAREINV_ARNOLDI - GMRES polynomial with the arnoldi basis -// PFLAREINV_NEWTON - GMRES polynomial with the newton basis with extra roots for stability - can only be used matrix-free atm -// PFLAREINV_NEWTON_NO_EXTRA - GMRES polynomial with the newton basis without extra roots - can only be used matrix-free atm +// PFLAREINV_NEWTON - GMRES polynomial with the newton basis with extra roots for stability +// PFLAREINV_NEWTON_NO_EXTRA - GMRES polynomial with the newton basis without extra roots // PFLAREINV_NEUMANN - Neumann polynomial // PFLAREINV_SAI - SAI - cannot be used matrix-free atm // PFLAREINV_ISAI - Incomplete SAI - cannot be used matrix-free atm @@ -335,11 +335,6 @@ static PetscErrorCode PCSetUp_PFLAREINV_c(PC pc) // ~~~~~~~ PetscCall(PCPFLAREINVGetType(pc, &type)); - // Newton has to be matrix free - if (type == PFLAREINV_NEWTON || type == PFLAREINV_NEWTON_NO_EXTRA) - { - PetscCheck(inv_data->matrix_free, comm, PETSC_ERR_ARG_WRONGSTATE, "GMRES polynomial with Newton basis must be applied matrix-free"); - } // SAI/ISAI can't be matrix free if (type == PFLAREINV_SAI || type == PFLAREINV_ISAI) { diff --git a/tests/Makefile b/tests/Makefile index 5dd2e454..f86f49ce 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -93,6 +93,8 @@ run_tests_load_serial: ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -ksp_max_it 21 @echo "Test single level GMRES polynomial preconditioning for hyperbolic streaming problem in C" ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type power -ksp_max_it 21 + @echo "Test single level Newton GMRES polynomial preconditioning for hyperbolic streaming problem in C" + ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type newton -ksp_max_it 21 # @echo "" @echo "Test single level GMRES polynomial preconditioning with the Newton basis matrix-free for hyperbolic streaming problem in C" @@ -121,6 +123,42 @@ run_tests_load_serial: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals" ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 + @echo "Test AIRG with Newton GMRES polynomials in indefinite problem with zero diagonals" + ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 26 +# + @echo "" + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order" + ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order" + ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 +# + @echo "" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order 
---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + done; \ + done +# + @echo "" + @echo "Test Newton GMRES polynomials order 60 matrix-free with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials order 60 with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials order 120 matrix-free with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 5 + @echo "Test Newton GMRES polynomials order 120 with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -ksp_norm_type unpreconditioned -ksp_max_it 5 +# + @echo "Test Newton AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing" + ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -156,6 +194,30 @@ run_tests_load_parallel: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals in parallel" $(MPIEXEC) -n 2 ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 +# + @echo "" + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 +# + @echo "" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders in parallel" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity in parallel" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + done; \ + done +# 
+ @echo "Test Newton AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -260,6 +322,32 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change with 0th order fixed sparsity" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 +# + @echo "" + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 0 + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 +# + @echo "" + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 1 + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 +# + @echo "" + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 +# + @echo "" + @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change with 0th order fixed sparsity" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 + @echo "Test AIRG Newton with GMRES polynomials with PC regenerated with no sparsity change with 0th order fixed sparsity" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 # @echo "" @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change and polynomial coeffs stored" @@ -351,7 +439,32 @@ run_tests_no_load_serial: -pc_air_improve_w_its 3 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power @echo "Test improving Z with PC regenerated with no sparsity change with 1 iteration" ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -pc_air_one_point_classical_prolong 0 \ - -pc_air_improve_w_its 1 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power + -pc_air_improve_w_its 1 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo 
"Test Newton AIRG on advection for for different orders" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 6; \ + done + @echo "Test Newton AIRG on advection for for different orders and fixed sparsity" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done; \ + done +# + @echo "" + @echo "Test Newton GMRES polynomial on a diagonal matrix" + ./mat_diag -ksp_max_it 1 + @echo "Test Arnoldi GMRES polynomial on a diagonal matrix" + ./mat_diag -pc_pflareinv_type arnoldi -ksp_max_it 1 + @echo "Test Power GMRES polynomial on a diagonal matrix" + ./mat_diag -pc_pflareinv_type power -ksp_max_it 1 # # ~~~~~~~~~~~~~~~~~~~~~~~ # Include kokkos examples @@ -484,6 +597,41 @@ run_tests_no_load_parallel: @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change in parallel with 0th order fixed sparsity" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 \ -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with GMRES 
polynomials with PC reused with no sparsity change in parallel with 0th order fixed sparsity" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with GMRES polynomials with PC regenerated with no sparsity change in parallel with 0th order fixed sparsity" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test solving isotropic diffusion with fast coarsening and near-nullspace in parallel" @@ -514,6 +662,23 @@ run_tests_no_load_parallel: @echo "Test improving W with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -pc_air_one_point_classical_prolong 0 \ -pc_air_improve_w_its 3 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo "Test Newton AIRG on advection for for different orders in parallel" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 6; \ + done + @echo "Test Newton AIRG on advection for for different orders and fixed sparsity in parallel" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done; \ + done # # ~~~~~~~~~~~~~~~~~~~~~~~ @@ -535,7 +700,7 @@ run_tests_medium_serial: # @echo "" - @for size in $$(seq 100 20 200); do \ + @set -e; for size in $$(seq 100 20 200); do \ echo "--- Testing size = $$size x $$size ---"; \ ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -543,7 +708,7 @@ run_tests_medium_serial: # @echo "" @echo "--- Running scaling study on adv_diff_2d ---" - @for size in 100 200 400 800; do \ + @set -e; for size in 100 200 400 800; do \ echo "--- Testing size = $$size x $$size ---"; \ ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -555,7 +720,7 @@ run_tests_medium_parallel: # @echo "" - @for size in $$(seq 100 20 200); do \ + @set -e; for size in $$(seq 100 20 200); do \ echo "--- Testing size = $$size x $$size ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -563,7 +728,7 @@ run_tests_medium_parallel: # @echo "" @echo "--- Running scaling study on adv_diff_2d in parallel ---" - @for size in 100 200 400 800; do \ + @set -e; for size in 100 200 400 800; do \ echo "--- Testing size = $$size x $$size ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 
-pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ diff --git a/tests/data/1138_bus b/tests/data/1138_bus new file mode 100644 index 00000000..dc745535 Binary files /dev/null and b/tests/data/1138_bus differ diff --git a/tests/data/1138_bus.info b/tests/data/1138_bus.info new file mode 100644 index 00000000..270767f1 --- /dev/null +++ b/tests/data/1138_bus.info @@ -0,0 +1 @@ +-matload_block_size 1 diff --git a/tests/ex12f_gmres_poly.F90 b/tests/ex12f_gmres_poly.F90 new file mode 100644 index 00000000..c0d188e0 --- /dev/null +++ b/tests/ex12f_gmres_poly.F90 @@ -0,0 +1,182 @@ +! + program main +#include + use petscksp +#include "finclude/pflare.h" + implicit none + +! Comparison between different forms of GMRES polynomials + + PetscErrorCode ierr + PetscInt m,n,mlocal,nlocal + PetscBool flg, check, no_power + PetscReal norm_power, norm_rhs, norm_arnoldi, norm_newton + PetscReal :: norm_diff_one, norm_diff_two + Vec x,b,u, b_diff_type + Mat A, A_diff_type + character*(128) f + PetscViewer fd + KSP ksp + PC pc + KSPConvergedReason reason + PetscInt, parameter :: one=1 + MatType :: mtype, mtype_input + + call PetscInitialize(PETSC_NULL_CHARACTER,ierr) + if (ierr .ne. 0) then + print*,'Unable to initialize PETSc' + stop + endif + +! Read in matrix and RHS + call PetscOptionsGetString(PETSC_NULL_OPTIONS, & + & PETSC_NULL_CHARACTER,'-f',f,flg,ierr) + call PetscViewerBinaryOpen(PETSC_COMM_WORLD,f,FILE_MODE_READ, & + & fd,ierr) + no_power = PETSC_FALSE + ! Our CI has an intel pipeline and the intel MPI breaks with the power basis + ! so we can disable the power basis test with a command line option + call PetscOptionsGetBool(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER, & + '-no_power', check,flg,ierr) + if (flg) no_power = check + + call MatCreate(PETSC_COMM_WORLD,A,ierr) + call MatLoad(A,fd,ierr) + + ! Get information about matrix + call MatGetSize(A,m,n,ierr) + call MatGetLocalSize(A,mlocal,nlocal,ierr) + + call VecCreate(PETSC_COMM_WORLD,b,ierr) + call VecLoad(b,fd,ierr) + call PetscViewerDestroy(fd,ierr) + + ! Test and see if the user wants us to use a different matrix type + ! with -mat_type on the command line + ! This lets us easily test our cpu and kokkos versions through our CI + call MatCreateFromOptions(PETSC_COMM_WORLD,PETSC_NULL_CHARACTER,& + one,mlocal,nlocal,m,n,A_diff_type,ierr) + call MatAssemblyBegin(A_diff_type,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A_diff_type,MAT_FINAL_ASSEMBLY,ierr) + + call MatGetType(A, mtype, ierr) + call MatGetType(A_diff_type, mtype_input, ierr) + + if (mtype /= mtype_input) then + ! Doesn't seem like there is a converter to kokkos + ! So instead we just copy into the empty A_diff_type + ! This will be slow as its not preallocated, but this is just for testing + call MatCopy(A, A_diff_type, DIFFERENT_NONZERO_PATTERN, ierr) + call MatDestroy(A,ierr) + A = A_diff_type + + ! Mat and vec types have to match + call VecCreateFromOptions(PETSC_COMM_WORLD,PETSC_NULL_CHARACTER, & + one,nlocal,n,b_diff_type,ierr) + call VecCopy(b,b_diff_type,ierr) + call VecDestroy(b,ierr) + b = b_diff_type + + else + call MatDestroy(A_diff_type,ierr) + end if + + ! Set up solution + call VecDuplicate(b,x,ierr) + call VecDuplicate(b,u,ierr) + + ! Register the pflare types + call PCRegister_PFLARE() + + call VecNorm(b,NORM_2,norm_rhs,ierr) + + ! ~~~~~~~~~~~~~ + ! Do a solve with the Arnoldi basis + ! 
+ + ! ~~~~~~~~~~~~~ + ! Do a solve with the Arnoldi basis + ! ~~~~~~~~~~~~~ + call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) + call KSPSetOperators(ksp,A,A,ierr) + call KSPGetPC(ksp, pc, ierr) + call PCSetType(pc, PCAIR, ierr) + call PCAIRSetInverseType(pc, PFLAREINV_ARNOLDI, ierr) + call KSPSetPC(ksp, pc, ierr) + call KSPSetFromOptions(ksp,ierr) + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_arnoldi,ierr) + norm_arnoldi = norm_arnoldi/norm_rhs + + ! ~~~~~~~~~~~~~ + ! Now do a solve with the Power basis + ! ~~~~~~~~~~~~~ + if (.NOT. no_power) then + call PCAIRSetInverseType(pc, PFLAREINV_POWER, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_power,ierr) + norm_power = norm_power/norm_rhs + end if + + ! ~~~~~~~~~~~~~ + ! Now do a solve with the Newton basis + ! ~~~~~~~~~~~~~ + call PCAIRSetInverseType(pc, PFLAREINV_NEWTON, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_newton,ierr) + norm_newton = norm_newton/norm_rhs + call KSPDestroy(ksp,ierr) + + ! ~~~~~~~~~~~~~ + ! Now check that all the residuals are the same + ! For low order polynomials on the diagonally dominant + ! A_ff on each level the bases should be essentially identical, and hence + ! there should be almost no difference in the resulting residuals + ! ~~~~~~~~~~~~~ + norm_diff_one = abs(norm_arnoldi - norm_newton)/norm_arnoldi + if (norm_diff_one > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_one + print *, "Newton basis residual: ", norm_newton + print *, "Arnoldi basis residual: ", norm_arnoldi + error stop 1 + end if + if (.NOT. no_power) then + norm_diff_two = abs(norm_arnoldi - norm_power)/norm_arnoldi + if (norm_diff_two > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_two + print *, "Power basis residual: ", norm_power + print *, "Arnoldi basis residual: ", norm_arnoldi + error stop 1 + end if + end if + + call VecDestroy(b,ierr) + call VecDestroy(x,ierr) + call VecDestroy(u,ierr) + call MatDestroy(A,ierr) + + call PetscFinalize(ierr) + + end \ No newline at end of file diff --git a/tests/mat_diag.F90 b/tests/mat_diag.F90 new file mode 100644 index 00000000..1d6a54af --- /dev/null +++ b/tests/mat_diag.F90 @@ -0,0 +1,119 @@ +#include "petsc/finclude/petscksp.h" +#include "finclude/pflare.h" + use petscksp + implicit none + + ! Test that the gmres polynomials can handle a small solve with a diagonal matrix + ! We leave the polynomial order here as the default (which is 6), despite + ! the fact that a much lower polynomial order gives an exact solve in this case + ! This tests that the various gmres polynomial methods correctly + ! identify that only a lower order is needed + + PetscErrorCode :: ierr + Mat :: A + PetscInt :: m, n, nnzs + PetscInt, parameter :: one = 1, two = 2, three = 3, zero = 0 + Vec :: x,b + KSP :: ksp + PC :: pc + PetscBool :: flg + KSPConvergedReason :: reason + PetscRandom :: rctx + + call PetscInitialize(PETSC_NULL_CHARACTER,ierr)
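+ ! Guard against failed initialization, mirroring the check in ex12f_gmres_poly + if (ierr .ne. 0) then + print*,'Unable to initialize PETSc' + stop + endif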
+ ! Register the pflare types + call PCRegister_PFLARE() + + m = 10 + n = 10 + call PetscOptionsGetInt(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER,'-m',m,flg,ierr) + call PetscOptionsGetInt(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER,'-n',n,flg,ierr) + + ! Create matrix + call MatCreate(PETSC_COMM_WORLD,A,ierr) + call MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,m*n,m*n,ierr) + call MatSetFromOptions(A,ierr) + nnzs = m + call MatSeqAIJSetPreallocation(A, nnzs, PETSC_NULL_INTEGER_ARRAY, ierr) + call MatMPIAIJSetPreallocation(A, nnzs, PETSC_NULL_INTEGER_ARRAY, nnzs, PETSC_NULL_INTEGER_ARRAY, ierr) + call MatSetUp(A,ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE, ierr) + + call MatCreateVecs(A,b,x,ierr) + call VecSet(x, 0d0, ierr) + + ! Random rhs + call PetscRandomCreate(PETSC_COMM_WORLD, rctx, ierr) + call PetscRandomSetFromOptions(rctx, ierr) + call VecSetRandom(b, rctx, ierr) + call PetscRandomDestroy(rctx, ierr) + + ! ~~~~~~~~~~~~~~ + ! Set constant diagonal values in the matrix + ! In Newton form we should only need a single root + ! (i.e., a 0th order polynomial) for an exact solve + ! Starting with the identity, the inverse should also be the identity + ! ~~~~~~~~~~~~~~ + call MatShift(A, 1d0, ierr) + + call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) + call KSPSetOperators(ksp,A,A,ierr) + call KSPGetPC(ksp,pc,ierr) + ! Set the newton gmres polynomial as the PC + call PCSetType(pc, PCPFLAREINV, ierr) + call PCPFLAREINVSetType(pc, PFLAREINV_NEWTON, ierr) + call KSPSetPC(ksp, pc, ierr) + call KSPSetFromOptions(ksp,ierr) + call KSPSetUp(ksp,ierr) + + ! Do the solve + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + ! ~~~~~~~~~~~~~~ + ! Now set the diagonal to 1.5 instead + ! In Newton form we should still only need a single root + ! ~~~~~~~~~~~~~~ + call MatShift(A, 0.5d0, ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call VecSet(x, 0d0, ierr) + + ! Do another solve - this will automatically trigger the setup as the matrix + ! has changed + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + ! ~~~~~~~~~~~~~~ + ! Now have two different constant values on the diagonal + ! In Newton form we should only need two roots + ! (i.e., a 1st order polynomial) for an exact solve + ! ~~~~~~~~~~~~~~ + ! Set one of the values to 2.5 + call MatSetValue(A, zero, zero, 2.5d0, INSERT_VALUES, ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call VecSet(x, 0d0, ierr) + + ! Do another solve - this will automatically trigger the setup as the matrix + ! has changed + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + call MatDestroy(A, ierr) + call VecDestroy(b, ierr) + call VecDestroy(x, ierr) + call KSPDestroy(ksp, ierr) + call PetscFinalize(ierr) + end \ No newline at end of file