From 65bf9c33d1c0b9d34e69072c6f34179ac92492a8 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 15 Jan 2026 18:03:06 +0000 Subject: [PATCH 01/41] Enable the use of 0th order assembled newton polynomial form of gmres polynomial --- src/Approx_Inverse_Setup.F90 | 6 +- src/Gmres_Poly_Newton.F90 | 168 ++++++++++++++++++++++++++++------- tests/Makefile | 14 +++ 3 files changed, 150 insertions(+), 38 deletions(-) diff --git a/src/Approx_Inverse_Setup.F90 b/src/Approx_Inverse_Setup.F90 index 35559ca..f83f6e4 100644 --- a/src/Approx_Inverse_Setup.F90 +++ b/src/Approx_Inverse_Setup.F90 @@ -303,13 +303,9 @@ subroutine finish_approximate_inverse(matrix, inverse_type, & ! Gmres polynomial with newton basis else if (inverse_type == PFLAREINV_NEWTON .OR. inverse_type == PFLAREINV_NEWTON_NO_EXTRA) then - if (.NOT. matrix_free) then - print *, "GMRES polynomial with Newton basis must be applied matrix-free" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - call build_gmres_polynomial_newton_inverse(matrix, poly_order, & coefficients, & + inverse_sparsity_order, matrix_free, reuse_mat, reuse_submatrices, & inv_matrix) ! Neumann polynomial diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 87e5f8d..4b81ae2 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -674,6 +674,7 @@ end subroutine petsc_matvec_gmres_newton_mf_residual subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & coefficients, & + poly_sparsity_order, matrix_free, reuse_mat, reuse_submatrices, & inv_matrix) ! Builds a matrix which is an approximation to the inverse of a matrix using the @@ -684,7 +685,10 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & type(tMat), intent(in) :: matrix integer, intent(in) :: poly_order PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients - type(tMat), intent(inout) :: inv_matrix + integer, intent(in) :: poly_sparsity_order + logical, intent(in) :: matrix_free + type(tMat), intent(inout) :: reuse_mat, inv_matrix + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices ! Local variables PetscInt :: global_rows, global_cols, local_rows, local_cols @@ -692,6 +696,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() + logical :: reuse_triggered ! ~~~~~~ @@ -708,47 +713,144 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! ~~~~~~~ ! Just build a matshell that applies our polynomial matrix-free ! ~~~~~~~ + if (matrix_free) then - ! If not re-using - if (PetscObjectIsNull(inv_matrix)) then + ! If not re-using + if (PetscObjectIsNull(inv_matrix)) then - ! Have to dynamically allocate this - allocate(mat_ctx) + ! Have to dynamically allocate this + allocate(mat_ctx) - ! We pass in the polynomial coefficients as the context - call MatCreateShell(MPI_COMM_MATRIX, local_rows, local_cols, global_rows, global_cols, & - mat_ctx, inv_matrix, ierr) - ! The subroutine petsc_matvec_gmres_newton_mf applies the polynomial inverse - call MatShellSetOperation(inv_matrix, & - MATOP_MULT, petsc_matvec_gmres_newton_mf, ierr) + ! We pass in the polynomial coefficients as the context + call MatCreateShell(MPI_COMM_MATRIX, local_rows, local_cols, global_rows, global_cols, & + mat_ctx, inv_matrix, ierr) + ! 
The subroutine petsc_matvec_gmres_newton_mf applies the polynomial inverse + call MatShellSetOperation(inv_matrix, & + MATOP_MULT, petsc_matvec_gmres_newton_mf, ierr) - call MatAssemblyBegin(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) - call MatAssemblyEnd(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) - ! Have to make sure to set the type of vectors the shell creates - call ShellSetVecType(matrix, inv_matrix) + call MatAssemblyBegin(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) + call MatAssemblyEnd(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) + ! Have to make sure to set the type of vectors the shell creates + call ShellSetVecType(matrix, inv_matrix) + + ! Create temporary vectors we use during application + ! Make sure to use matrix here to get the right type (as the shell doesn't know about gpus) + call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_TEMP), PETSC_NULL_VEC, ierr) + call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_RHS), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) + + ! Reusing + else + call MatShellGetContext(inv_matrix, mat_ctx, ierr) + + end if + + mat_ctx%real_roots => coefficients(:, 1) + mat_ctx%imag_roots => coefficients(:, 2) + ! Now because the context reset deallocates the coefficient pointer + ! we want to make sure we don't leak memory, so we use pointer remapping here + ! to turn the 2D coefficient pointer into a 1D that we can store in mat_ctx%coefficients + ! and then the deallocate on mat_ctx%coefficients should still delete all the memory + mat_ctx%coefficients(1:2*size(coefficients,1)) => coefficients(:, :) + ! This is the matrix whose inverse we are applying (just copying the pointer here) + mat_ctx%mat = matrix - ! Create temporary vectors we use during application - ! Make sure to use matrix here to get the right type (as the shell doesn't know about gpus) - call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_TEMP), PETSC_NULL_VEC, ierr) - call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_RHS), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) + ! We're done + return + endif - ! Reusing - else - call MatShellGetContext(inv_matrix, mat_ctx, ierr) + ! ~~~~~~~~~~~~ + ! If we're here then we want an assembled approximate inverse + ! ~~~~~~~~~~~~ + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + + ! If we're zeroth order poly this is trivial as it's just 1/theta_1 I + if (poly_order == 0) then + + call build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, coefficients, & + inv_matrix) + + ! Then just return + return + + ! For poly_order 1 and poly_sparsity_order 1 this is easy + else if (poly_order == 1 .AND. poly_sparsity_order == 1) then + + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, inv_matrix) + + ! Flags to prevent reductions when assembling (there are assembles in the shift) + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) + + ! ! We want 1/theta_1 (I - A/theta_1) + ! ! result = -A_ff/theta_1^2 + ! ! We know if we have only a first order polynomial the first root + ! ! is purely real (as complex roots come in conjugate pairs) + ! call MatScale(inv_matrix, -1d0/(coefficients(1, 1))**2, ierr) + + ! ! result = -A_ff/theta_1^2 + 1/theta_1 I + ! ! Don't need an assemble as there is one called in this + ! call MatShift(inv_matrix, 1d0/coefficients(1, 1), ierr) + + ! 
Then just return + return end if + + + + + end subroutine build_gmres_polynomial_newton_inverse + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, coefficients, & + inv_matrix) + + ! Specific 0th order inverse + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + + ! Local variables + integer :: errorcode + PetscErrorCode :: ierr + logical :: reuse_triggered + type(tVec) :: diag_vec + + ! ~~~~~~ + + if (poly_order /= 0) then + print *, "This is a 0th order inverse, but poly_order is not 0" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + + ! Let's create a matrix to represent the inverse diagonal + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + + if (.NOT. reuse_triggered) then + call MatCreateVecs(matrix, PETSC_NULL_VEC, diag_vec, ierr) + else + call MatDiagonalGetDiagonal(inv_matrix, diag_vec, ierr) + end if + + ! Must be real as we only have one coefficient + call VecSet(diag_vec, 1d0/coefficients(1, 1), ierr) + + ! We may be reusing with the same sparsity + if (.NOT. reuse_triggered) then + ! The matrix takes ownership of diag_vec and increases ref counter + call MatCreateDiagonal(diag_vec, inv_matrix, ierr) + call VecDestroy(diag_vec, ierr) + else + call MatDiagonalRestoreDiagonal(inv_matrix, diag_vec, ierr) + end if - mat_ctx%real_roots => coefficients(:, 1) - mat_ctx%imag_roots => coefficients(:, 2) - ! Now because the context reset deallocates the coefficient pointer - ! we want to make sure we don't leak memory, so we use pointer remapping here - ! to turn the 2D coefficient pointer into a 1D that we can store in mat_ctx%coefficients - ! and then the deallocate on mat_ctx%coefficients should still delete all the memory - mat_ctx%coefficients(1:2*size(coefficients,1)) => coefficients(:, :) - ! This is the matrix whose inverse we are applying (just copying the pointer here) - mat_ctx%mat = matrix - - end subroutine build_gmres_polynomial_newton_inverse + end subroutine build_gmres_polynomial_newton_inverse_0th_order ! 
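! Editor's note: a minimal usage sketch, not part of the original patch, showing how the
! 0th order assembled inverse above would be applied. With a single (necessarily real)
! root theta_1 the polynomial is just p(A) = (1/theta_1) I, so the assembled MATDIAGONAL
! is applied with one MatMult. The vectors r and z are hypothetical placeholders for a
! residual and its preconditioned result.
!
!   type(tVec) :: r, z
!   call build_gmres_polynomial_newton_inverse_0th_order(matrix, 0, coefficients, inv_matrix)
!   call MatCreateVecs(matrix, r, z, ierr)
!   ! z approximates A^-1 r, here just r / theta_1
!   call MatMult(inv_matrix, r, z, ierr)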
------------------------------------------------------------------------------------------------------------------------------- diff --git a/tests/Makefile b/tests/Makefile index 5dd2e45..e5a119f 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -260,6 +260,12 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change with 0th order fixed sparsity" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 +# + @echo "" + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 0 + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 # @echo "" @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change and polynomial coeffs stored" @@ -484,6 +490,14 @@ run_tests_no_load_parallel: @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change in parallel with 0th order fixed sparsity" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 \ -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test solving isotropic diffusion with fast coarsening and near-nullspace in parallel" From 90ca1d1bba4923af0af0bf0e38effe88acaa8bc2 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 15 Jan 2026 20:07:40 +0000 Subject: [PATCH 02/41] Enable the use of 1st order assembled newton polynomial form of gmres polynomial --- src/AIR_MG_Setup.F90 | 4 ++- src/Gmres_Poly_Newton.F90 | 59 +++++++++++++++++++++++++++++++-------- src/PCPFLAREINV.c | 9 ++---- tests/Makefile | 16 ++++++++++- 4 files changed, 68 insertions(+), 20 deletions(-) diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 180d0f0..f35bdfa 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -424,8 +424,10 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) end if ! Convert Aff to a matdiagonal type - ! Haven't rewritten sai to take advantage of matdiagonal + ! Haven't rewritten some inverse types to take advantage of matdiagonal if (aff_diag .AND. & + inverse_type_aff /= PFLAREINV_NEWTON .AND. & + inverse_type_aff /= PFLAREINV_NEWTON_NO_EXTRA .AND. & inverse_type_aff /= PFLAREINV_SAI .AND. 
& inverse_type_aff /= PFLAREINV_ISAI) then diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 4b81ae2..d4f4e27 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -697,6 +697,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() logical :: reuse_triggered + PetscReal :: square_sum ! ~~~~~~ @@ -761,7 +762,11 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! ~~~~~~~~~~~~ ! If we're here then we want an assembled approximate inverse ! ~~~~~~~~~~~~ - reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + + ! For the 0th and 1st order assembled polynomial we just combine the coefficients + ! to get the mononomial form and assemble it, which should be stable for such low order + ! For higher order we use the actual Newton form ! If we're zeroth order poly this is trivial as it's just 1/theta_1 I if (poly_order == 0) then @@ -783,22 +788,54 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - ! ! We want 1/theta_1 (I - A/theta_1) - ! ! result = -A_ff/theta_1^2 - ! ! We know if we have only a first order polynomial the first root - ! ! is purely real (as complex roots come in conjugate pairs) - ! call MatScale(inv_matrix, -1d0/(coefficients(1, 1))**2, ierr) + ! We only have two coefficients, so they are either both real or complex conjugates + ! If real + if (coefficients(1,2) == 0d0) then + + ! Have to be careful here, as we may be first order, but the second eigenvaule + ! might have been set to zero thanks to the rank reducing solve + ! So we just check if the second imaginary part is zero and if it is + ! we just compute a 0th order inverse - annoyingly we can't call + ! build_gmres_polynomial_newton_inverse_0th_order as that builds a MATDIAGONAL + ! and in the tests there is a problem where we reuse the sparsity, in the first + ! solve we don't have a zero coefficient but in the second solve we do + ! So the mat type needs to remain consistent + ! This can't happen in the complex case + if (coefficients(2,1) == 0d0) then + + ! Set to zero + call MatScale(inv_matrix, 0d0, ierr) + ! Then add in the 0th order inverse + call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) + + ! Then just return + return + end if - ! ! result = -A_ff/theta_1^2 + 1/theta_1 I - ! ! Don't need an assemble as there is one called in this - ! call MatShift(inv_matrix, 1d0/coefficients(1, 1), ierr) + ! result = -A_ff/(theta_1 * theta_2) + call MatScale(inv_matrix, -1d0/(coefficients(1, 1) * coefficients(2, 1)), ierr) + + ! result = I * (1/theta_1 + 1/theta_2) - A_ff/(theta_1 * theta_2) + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 1d0/(coefficients(1, 1)) + 1d0/(coefficients(2, 1)), ierr) + + ! Complex conjugate roots, a +- ib + else + ! a^2 + b^2 + square_sum = coefficients(1,1)**2 + coefficients(1,2)**2 + + ! Complex conjugate roots + ! result = -A_ff / (a^2 + b^2) + call MatScale(inv_matrix, -1d0/square_sum, ierr) + ! result = 2a/(a^2 + b^2) I - A_ff / (a^2 + b^2) + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 2d0 * coefficients(1,1)/square_sum, ierr) + end if ! 
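! Editor's aside, not part of the original patch: a short check of the two branches above.
! The degree 1 polynomial with roots theta_1, theta_2 has the monomial form
!   p(A) = (1/theta_1 + 1/theta_2) I - A_ff/(theta_1*theta_2)
! For a conjugate pair theta = a + ib, conj(theta) = a - ib this stays real, since
!   1/theta + 1/conj(theta) = 2a/(a^2 + b^2)  and  theta*conj(theta) = a^2 + b^2
! giving
!   p(A) = (2a/(a^2 + b^2)) I - A_ff/(a^2 + b^2)
! which is exactly the MatScale by -1/(a^2 + b^2) followed by the MatShift of
! 2a/(a^2 + b^2) in the complex branch above.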
Then just return return end if - - end subroutine build_gmres_polynomial_newton_inverse diff --git a/src/PCPFLAREINV.c b/src/PCPFLAREINV.c index b7f3a8e..8b6826b 100644 --- a/src/PCPFLAREINV.c +++ b/src/PCPFLAREINV.c @@ -18,8 +18,8 @@ PETSC_EXTERN void calculate_and_build_approximate_inverse_c(Mat *input_mat, Pets // // PFLAREINV_POWER - GMRES polynomial with the power basis // PFLAREINV_ARNOLDI - GMRES polynomial with the arnoldi basis -// PFLAREINV_NEWTON - GMRES polynomial with the newton basis with extra roots for stability - can only be used matrix-free atm -// PFLAREINV_NEWTON_NO_EXTRA - GMRES polynomial with the newton basis without extra roots - can only be used matrix-free atm +// PFLAREINV_NEWTON - GMRES polynomial with the newton basis with extra roots for stability +// PFLAREINV_NEWTON_NO_EXTRA - GMRES polynomial with the newton basis without extra roots // PFLAREINV_NEUMANN - Neumann polynomial // PFLAREINV_SAI - SAI - cannot be used matrix-free atm // PFLAREINV_ISAI - Incomplete SAI - cannot be used matrix-free atm @@ -335,11 +335,6 @@ static PetscErrorCode PCSetUp_PFLAREINV_c(PC pc) // ~~~~~~~ PetscCall(PCPFLAREINVGetType(pc, &type)); - // Newton has to be matrix free - if (type == PFLAREINV_NEWTON || type == PFLAREINV_NEWTON_NO_EXTRA) - { - PetscCheck(inv_data->matrix_free, comm, PETSC_ERR_ARG_WRONGSTATE, "GMRES polynomial with Newton basis must be applied matrix-free"); - } // SAI/ISAI can't be matrix free if (type == PFLAREINV_SAI || type == PFLAREINV_ISAI) { diff --git a/tests/Makefile b/tests/Makefile index e5a119f..32f49de 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -266,6 +266,12 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 0 @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 +# + @echo "" + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 1 + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 # @echo "" @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change and polynomial coeffs stored" @@ -497,7 +503,15 @@ run_tests_no_load_parallel: -pc_air_a_drop 1e-3 -pc_air_inverse_type newton @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 \ - -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test solving 
isotropic diffusion with fast coarsening and near-nullspace in parallel" From aa688c2100538cfda7964792da324a15c05134e5 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 15 Jan 2026 23:11:31 +0000 Subject: [PATCH 03/41] Enable the use of not fixed sparsity assembled newton polynomial form of gmres polynomial --- src/Gmres_Poly_Newton.F90 | 162 +++++++++++++++++++++++++++++++++++++- tests/Makefile | 19 +++++ 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index d4f4e27..5f32bab 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -692,12 +692,13 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! Local variables PetscInt :: global_rows, global_cols, local_rows, local_cols - integer :: comm_size, errorcode + integer :: comm_size, errorcode, order PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() logical :: reuse_triggered PetscReal :: square_sum + type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 ! ~~~~~~ @@ -837,6 +838,165 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & end if + ! If we're constraining sparsity we've built a custom matrix-powers that assumes fixed sparsity + if (poly_sparsity_order < poly_order) then + + ! ! This routine is a custom one that builds our matrix powers and assumes fixed sparsity + ! ! so that it doen't have to do much comms + ! ! This also finishes off the asyn comms and computes the coefficients + ! call mat_mult_powers_share_sparsity(matrix, poly_order, poly_sparsity_order, buffers, coefficients, & + ! reuse_mat, reuse_submatrices, inv_matrix) + + ! ! Then just return + return + + end if + + ! ~~~~~~~~~~ + ! We are only here if we don't constrain_sparsity + ! ~~~~~~~~~~ + + ! If not re-using + ! Copy in the initial matrix + if (.NOT. reuse_triggered) then + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, .FALSE., inv_matrix) + else + ! For the powers > 1 the pattern of the original matrix will be different + ! to the resulting inverse + call MatCopy(matrix, inv_matrix, DIFFERENT_NONZERO_PATTERN, ierr) + end if + + ! Set to zero as we add in each product of terms + call MatScale(inv_matrix, 0d0, ierr) + + ! Don't set any off processor entries so no need for a reduction when assembling + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + + ! We start with an identity in mat_product + call generate_identity(matrix, mat_product) + + ! ~~~~~~~~~~~~ + ! Iterate over the order + ! This is basically the same as the MF application but we have to build the powers + ! ~~~~~~~~~~~~ + order = 1 + do while (order .le. poly_order - 1) + + ! Duplicate & copy the matrix, but ensure there is a diagonal present + ! temp_mat_A is going to store things with the sparsity of A + if (PetscObjectIsNull(temp_mat_A)) then + call mat_duplicate_copy_plus_diag(matrix, .FALSE., temp_mat_A) + else + ! Can reuse the sparsity + call mat_duplicate_copy_plus_diag(matrix, .TRUE., temp_mat_A) + end if + + ! If real this is easy + if (coefficients(order,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(order,1)) < 1e-12) then + order = order + 1 + cycle + end if + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! 
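! Editor's aside, not part of the original patch: written out, the sum this loop
! accumulates is the Newton form
!   p(A) = (1/theta_1) I
!        + (1/theta_2) (I - A_ff/theta_1)
!        + (1/theta_3) (I - A_ff/theta_2)(I - A_ff/theta_1) + ...
! mat_product carries the running product of (I - A_ff/theta_j) factors, inv_matrix the
! running sum, and conjugate pairs of roots are folded together in the complex branch so
! that every assembled matrix stays real.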
If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0/coefficients(order,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(order,1), mat_product) + end if + + ! temp_mat_A = A_ff/theta_k + call MatScale(temp_mat_A, -1d0/coefficients(order,1), ierr) + ! temp_mat_A = I - A_ff/theta_k + call MatShift(temp_mat_A, 1d0, ierr) + + ! mat_product_k_plus_1 = mat_product * temp_mat_A + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, mat_product_k_plus_1, ierr) + call MatDestroy(mat_product, ierr) + mat_product = mat_product_k_plus_1 + + order = order + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(order,1)**2 + coefficients(order,2)**2 < 1e-12) then + order = order + 2 + cycle + end if + + ! Compute 2a I - A + ! Have to use the DIFFERENT_NONZERO_PATTERN here + ! temp_mat_A = -A + call MatScale(temp_mat_A, -1d0, ierr) + ! temp_mat_A = 2a I - A_ff + call MatShift(temp_mat_A, 2d0 * coefficients(order,1), ierr) + ! temp_mat_A = (2a I - A_ff)/(a^2 + b^2) + call MatScale(temp_mat_A, 1d0/(coefficients(order,1)**2 + coefficients(order,2)**2), ierr) + + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0, temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0, temp_mat_two) + end if + + if (order .le. size(coefficients, 1) - 2) then + ! temp_mat_three = matrix * temp_mat_two + call MatMatMult(matrix, temp_mat_two, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) + call MatDestroy(temp_mat_two, ierr) + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(mat_product, -1d0, temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(mat_product, -1d0, temp_mat_three) + end if + call MatDestroy(temp_mat_three, ierr) + else + call MatDestroy(temp_mat_two, ierr) + end if + + ! Skip two evals + order = order + 2 + + end if + end do + + ! Final step if last root is real + if (coefficients(order,2) == 0d0) then + ! Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(order,1)) > 1e-12) then + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0/coefficients(order,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! 
Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(order,1), mat_product) + end if + end if + end if + + call MatDestroy(temp_mat_A, ierr) + call MatDestroy(mat_product, ierr) end subroutine build_gmres_polynomial_newton_inverse diff --git a/tests/Makefile b/tests/Makefile index 32f49de..d7c378c 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -272,6 +272,14 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 1 @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 +# + @echo "" + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 # @echo "" @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change and polynomial coeffs stored" @@ -511,6 +519,17 @@ run_tests_no_load_parallel: -pc_air_a_drop 1e-3 -pc_air_inverse_type newton @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" From f336df67056ce4b274ad70867549bc0d4a911f11 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 15 Jan 2026 23:25:31 +0000 Subject: [PATCH 04/41] Rewrite the newton first order assembled so it doesn't compute theta_1*theta_2 --- src/Gmres_Poly_Newton.F90 | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 5f32bab..3721c54 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -764,10 +764,6 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! 
If we're here then we want an assembled approximate inverse
 ! ~~~~~~~~~~~~
 reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix)
-
- ! For the 0th and 1st order assembled polynomial we just combine the coefficients
- ! to get the mononomial form and assemble it, which should be stable for such low order
- ! For higher order we use the actual Newton form
 
 ! If we're zeroth order poly this is trivial as it's just 1/theta_1 I
 if (poly_order == 0) then
@@ -795,7 +791,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, &
 
 ! Have to be careful here, as we may be first order, but the second eigenvaule
 ! might have been set to zero thanks to the rank reducing solve
- ! So we just check if the second imaginary part is zero and if it is
+ ! So we just check if the second real part is zero and if it is
 ! we just compute a 0th order inverse - annoyingly we can't call
 ! build_gmres_polynomial_newton_inverse_0th_order as that builds a MATDIAGONAL
 ! and in the tests there is a problem where we reuse the sparsity, in the first
@@ -813,12 +809,20 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, &
 return
 end if
 
- ! result = -A_ff/(theta_1 * theta_2)
- call MatScale(inv_matrix, -1d0/(coefficients(1, 1) * coefficients(2, 1)), ierr)
+ ! Could just compute the equivalent monomial here to save some flops
+ ! but the whole point of doing the Newton form is to avoid the
+ ! theta_1 * theta_2 that would result
+
+ ! result = -A_ff/theta_1
+ call MatScale(inv_matrix, -1d0/(coefficients(1, 1)), ierr)
+ ! result = I - A_ff/theta_1
+ call MatShift(inv_matrix, 1d0, ierr)
+ ! result = 1/theta_2 * (I - A_ff/theta_1)
+ call MatScale(inv_matrix, 1d0/(coefficients(2, 1)), ierr)
 
- ! result = I * (1/theta_1 + 1/theta_2) - A_ff/(theta_1 * theta_2)
+ ! result = 1/theta_1 + 1/theta_2 * (I - A_ff/theta_1)
 ! Don't need an assemble as there is one called in this
- call MatShift(inv_matrix, 1d0/(coefficients(1, 1)) + 1d0/(coefficients(2, 1)), ierr)
+ call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr)
 
 ! Complex conjugate roots, a +- ib
 else

From bf9a0da159c2d53262384fd4d987fe854a083cc2 Mon Sep 17 00:00:00 2001
From: sdargavi
Date: Thu, 15 Jan 2026 23:35:57 +0000
Subject: [PATCH 05/41] Was accidentally finishing an order early for the newton assembled not fixed sparsity

---
 src/Gmres_Poly_Newton.F90 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90
index 3721c54..5ec8433 100644
--- a/src/Gmres_Poly_Newton.F90
+++ b/src/Gmres_Poly_Newton.F90
@@ -885,7 +885,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, &
 ! This is basically the same as the MF application but we have to build the powers
 ! ~~~~~~~~~~~~
 order = 1
- do while (order .le. poly_order - 1)
+ do while (order .le. size(coefficients, 1) - 1)
 
 ! Duplicate & copy the matrix, but ensure there is a diagonal present
 ! temp_mat_A is going to store things with the sparsity of A
@@ -984,7 +984,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, &
 end do
 
 ! Final step if last root is real
- if (coefficients(order,2) == 0d0) then
+ if (coefficients(size(coefficients,1),2) == 0d0) then
 ! Add in the final term multiplied by 1/theta_poly_order
 
 ! 
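! Editor's aside, not part of the original patch: the two changes in this patch replace
! poly_order with size(coefficients, 1) presumably because coefficients holds one row per
! Newton root, poly_order + 1 roots for a degree poly_order polynomial plus any extra
! roots PFLAREINV_NEWTON adds for stability, so indexing by poly_order alone dropped the
! final root(s) from the assembled sum.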
Skips eigenvalues that are numerically zero From ad448b89ec16ee1f5a7fd8cc1ab92fe17daadc45 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 16 Jan 2026 16:10:09 +0000 Subject: [PATCH 06/41] Tidy variable names and fix comment --- src/Gmres_Poly.F90 | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/Gmres_Poly.F90 b/src/Gmres_Poly.F90 index 552f77a..309b42e 100644 --- a/src/Gmres_Poly.F90 +++ b/src/Gmres_Poly.F90 @@ -851,7 +851,6 @@ subroutine mat_mult_powers_share_sparsity_cpu(matrix, poly_order, poly_sparsity_ ! Compute matrix powers c = coeff(1) * I + coeff(2) * A + coeff(3) * A^2 + coeff(4) * A^3 + ... ! where a c and the powers all share the same sparsity as the power input in poly_sparsity_order - ! Assuming cmat has not been built/allocated ! This also finishes the async comms required to compute the gmres poly coefficients if buffers%request is allocated ! ~~~~~~~~~~ @@ -1655,7 +1654,7 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, integer :: order PetscErrorCode :: ierr logical :: reuse_triggered - type(tVec) :: rhs_copy, diag_vec, power_vec + type(tVec) :: inv_vec, diag_vec, power_vec ! ~~~~~~ reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) @@ -1666,9 +1665,9 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, ! This stores D^order if (.NOT. reuse_triggered) then - call VecDuplicate(diag_vec, rhs_copy, ierr) + call VecDuplicate(diag_vec, inv_vec, ierr) else - call MatDiagonalGetDiagonal(inv_matrix, rhs_copy, ierr) + call MatDiagonalGetDiagonal(inv_matrix, inv_vec, ierr) end if call VecCopy(diag_vec, power_vec, ierr) @@ -1678,22 +1677,22 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, call finish_gmres_polynomial_coefficients_power(poly_order, buffers, coefficients) ! Set: alpha_0 * I - call VecSet(rhs_copy, coefficients(1), ierr) + call VecSet(inv_vec, coefficients(1), ierr) ! Calculate: alpha_0 * I + alpha_1 * D + alpha_2 * D^2 do order = 1, poly_order - call VecAXPY(rhs_copy, coefficients(order+1), power_vec, ierr) + call VecAXPY(inv_vec, coefficients(order+1), power_vec, ierr) ! Compute power_vec = power_vec * D if (order /= poly_order) call VecPointwiseMult(power_vec, power_vec, diag_vec, ierr) end do ! We may be reusing with the same sparsity if (.NOT. reuse_triggered) then - ! The matrix takes ownership of rhs_copy and increases ref counter - call MatCreateDiagonal(rhs_copy, inv_matrix, ierr) - call VecDestroy(rhs_copy, ierr) + ! 
The matrix takes ownership of inv_vec and increases ref counter + call MatCreateDiagonal(inv_vec, inv_matrix, ierr) + call VecDestroy(inv_vec, ierr) else - call MatDiagonalRestoreDiagonal(inv_matrix, rhs_copy, ierr) + call MatDiagonalRestoreDiagonal(inv_matrix, inv_vec, ierr) end if call VecDestroy(diag_vec, ierr) From 4e94affb194f4df50b52152fed7e78ae985e151b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 16 Jan 2026 16:10:56 +0000 Subject: [PATCH 07/41] Enable the use of 0th order fixed sparsity (diagonal) assembled newton polynomial form of gmres polynomial --- src/Gmres_Poly_Newton.F90 | 694 +++++++++++++++++++++++++++++++++++++- tests/Makefile | 16 +- 2 files changed, 704 insertions(+), 6 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 5ec8433..a7896ce 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -669,6 +669,560 @@ subroutine petsc_matvec_gmres_newton_mf_residual(mat, x, y) end do end subroutine petsc_matvec_gmres_newton_mf_residual +!------------------------------------------------------------------------------------------------------------------------ + + subroutine mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsity_order, coefficients, & + reuse_mat, reuse_submatrices, cmat) + + ! Wrapper around mat_mult_powers_share_sparsity_cpu and mat_mult_powers_share_sparsity_kokkos + + ! ~~~~~~~~~~ + ! Input + type(tMat), target, intent(in) :: matrix + integer, intent(in) :: poly_order, poly_sparsity_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: reuse_mat, cmat + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices + +#if defined(PETSC_HAVE_KOKKOS) + integer(c_long_long) :: A_array, B_array, reuse_array + integer :: errorcode, reuse_int_cmat, reuse_int_reuse_mat + PetscErrorCode :: ierr + MatType :: mat_type + Mat :: temp_mat, temp_mat_reuse, temp_mat_compare + PetscScalar normy; + logical :: reuse_triggered_cmat, reuse_triggered_reuse_mat + type(c_ptr) :: coefficients_ptr + type(tMat) :: reuse_mat_cpu + type(tMat), dimension(:), pointer :: reuse_submatrices_cpu +#endif + ! ~~~~~~~~~~ + + ! ~~~~~~~~~~ + ! Special case if we just want to return a gmres polynomial with the sparsity of the diagonal + ! This is like a damped Jacobi + ! ~~~~~~~~~~ +if (poly_sparsity_order == 0) then + + call build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly_order, & + coefficients, cmat) + + return +end if + +#if defined(PETSC_HAVE_KOKKOS) + + call MatGetType(matrix, mat_type, ierr) + if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & + mat_type == MATAIJKOKKOS) then + + A_array = matrix%v + reuse_triggered_cmat = .NOT. PetscObjectIsNull(cmat) + reuse_triggered_reuse_mat = .NOT. PetscObjectIsNull(reuse_mat) + reuse_int_cmat = 0 + if (reuse_triggered_cmat) then + reuse_int_cmat = 1 + B_array = cmat%v + end if + reuse_int_reuse_mat = 0 + if (reuse_triggered_reuse_mat) then + reuse_int_reuse_mat = 1 + end if + reuse_array = reuse_mat%v + coefficients_ptr = c_loc(coefficients) + + ! call mat_mult_powers_share_sparsity_newton_kokkos(A_array, poly_order, poly_sparsity_order, & + ! coefficients_ptr, reuse_int_reuse_mat, reuse_array, reuse_int_cmat, B_array) + + reuse_mat%v = reuse_array + cmat%v = B_array + + ! If debugging do a comparison between CPU and Kokkos results + if (kokkos_debug()) then + + ! If we're doing reuse and debug, then we have to always output the result + ! 
from the cpu version, as it will have coo preallocation structures set + ! They aren't copied over if you do a matcopy (or matconvert) + ! If we didn't do that the next time we come through this routine + ! and try to call the cpu version with reuse, it will segfault + if (reuse_triggered_cmat) then + temp_mat = cmat + call MatConvert(cmat, MATSAME, MAT_INITIAL_MATRIX, temp_mat_compare, ierr) + else + temp_mat_compare = cmat + end if + + ! Debug check if the CPU and Kokkos versions are the same + ! We send in an empty reuse_mat_cpu here always, as we can't pass through + ! the same one Kokkos uses as it now only gets out the non-local rows we need + ! (ie reuse_mat and reuse_mat_cpu are no longer the same size) + reuse_submatrices_cpu => null() + call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & + coefficients, reuse_mat_cpu, reuse_submatrices_cpu, temp_mat) + call destroy_matrix_reuse(reuse_mat_cpu, reuse_submatrices_cpu) + + call MatConvert(temp_mat, MATSAME, MAT_INITIAL_MATRIX, & + temp_mat_reuse, ierr) + + call MatAXPYWrapper(temp_mat_reuse, -1d0, temp_mat_compare) + call MatNorm(temp_mat_reuse, NORM_FROBENIUS, normy, ierr) + ! There is floating point compute in these inverses, so we have to be a + ! bit more tolerant to rounding differences + if (normy .gt. 1d-11 .OR. normy/=normy) then + !call MatFilter(temp_mat_reuse, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) + !call MatView(temp_mat_reuse, PETSC_VIEWER_STDOUT_WORLD, ierr) + print *, "Kokkos and CPU versions of mat_mult_powers_share_sparsity do not match" + + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + call MatDestroy(temp_mat_reuse, ierr) + if (.NOT. reuse_triggered_cmat) then + call MatDestroy(cmat, ierr) + else + call MatDestroy(temp_mat_compare, ierr) + end if + cmat = temp_mat + end if + + else + + call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & + coefficients, reuse_mat, reuse_submatrices, cmat) + + end if +#else + call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & + coefficients, reuse_mat, reuse_submatrices, cmat) +#endif + + ! ~~~~~~~~~~ + + end subroutine mat_mult_powers_share_sparsity_newton + +!------------------------------------------------------------------------------------------------------------------------ + + subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, coefficients, & + reuse_mat, reuse_submatrices, cmat) + + ! Compute newton powers with the same sparsity + + ! ~~~~~~~~~~ + ! 
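! Editor's note, not part of the original patch: the body of this routine is still
! commented out at this point in the series. Judging from the power basis version
! mat_mult_powers_share_sparsity_cpu in Gmres_Poly.F90, the intent is to evaluate the
! Newton form while constraining every product to the sparsity of
! A^poly_sparsity_order, pulling the required off-process rows of the matrix over once
! with MatCreateSubMatrices and then forming the remaining row-wise products locally.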
Input + type(tMat), target, intent(in) :: matrix + integer, intent(in) :: poly_order, poly_sparsity_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: reuse_mat, cmat + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices + + PetscInt :: local_rows, local_cols, global_rows, global_cols + PetscInt :: global_row_start, global_row_end_plus_one, row_index_into_submatrix + PetscInt :: global_col_start, global_col_end_plus_one, n, ncols, ncols_two, ifree, max_nnzs + PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0 + integer :: errorcode, match_counter, term, order + integer :: comm_size + PetscErrorCode :: ierr + integer, dimension(:), allocatable :: cols_index_one, cols_index_two + PetscInt, dimension(:), allocatable :: col_indices_off_proc_array, ad_indices, cols + PetscReal, dimension(:), allocatable :: vals + type(tIS), dimension(1) :: col_indices, row_indices + type(tMat) :: Ad, Ao + PetscInt, dimension(:), pointer :: colmap + logical :: deallocate_submatrices = .FALSE. + type(c_ptr) :: vals_c_ptr + type(tMat), dimension(size(coefficients)-1), target :: matrix_powers + type(tMat), pointer :: mat_sparsity_match + type(int_vec), dimension(:), allocatable :: symbolic_ones + type(real_vec), dimension(:), allocatable :: symbolic_vals + integer(c_long_long) A_array + MPIU_Comm :: MPI_COMM_MATRIX + PetscReal, dimension(:), allocatable :: vals_power_temp, vals_previous_power_temp + PetscInt, dimension(:), pointer :: submatrices_ia, submatrices_ja, cols_two_ptr, cols_ptr + PetscReal, dimension(:), pointer :: vals_two_ptr, vals_ptr + real(c_double), pointer :: submatrices_vals(:) + logical :: reuse_triggered + PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done + PetscInt, parameter :: one = 1, zero = 0 + + ! ~~~~~~~~~~ + + if (poly_sparsity_order .ge. size(coefficients)-1) then + print *, "Requested sparsity is greater than or equal to the order" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + + call PetscObjectGetComm(matrix, MPI_COMM_MATRIX, ierr) + ! Get the comm size + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + + ! Get the local sizes + call MatGetLocalSize(matrix, local_rows, local_cols, ierr) + call MatGetSize(matrix, global_rows, global_cols, ierr) + ! This returns the global index of the local portion of the matrix + call MatGetOwnershipRange(matrix, global_row_start, global_row_end_plus_one, ierr) + call MatGetOwnershipRangeColumn(matrix, global_col_start, global_col_end_plus_one, ierr) + + reuse_triggered = .NOT. PetscObjectIsNull(cmat) + + ! ! ~~~~~~~~~~ + ! ! Compute any matrix powers we might need to constrain sparsity and start assembling the + ! ! components of the output matrix up to the order of poly_sparsity_order + ! ! The powers higher than poly_sparsity_order can be done with only + ! ! a single bit of comms and is done below this + ! ! ~~~~~~~~~~ + + ! ! matrix_powers stores all the powers of the input matrix + ! matrix_powers(1) = matrix + + ! ! What power of A do we want to match the sparsity of + ! ! Compute the power we need if we're two or above + ! do order = 2, poly_sparsity_order + + ! ! Let's just store each power, that way we can set the sparsity + ! ! as the highest (unconstrained) power and do the mataxpy with a subset of entries + ! ! Takes more memory to do this but is faster + ! call MatMatMult(matrix, matrix_powers(order-1), & + ! MAT_INITIAL_MATRIX, 1.5d0, matrix_powers(order), ierr) + ! 
end do + + ! ! mat_sparsity_match now contains the sparsity of the power of A we want to match + ! mat_sparsity_match => matrix_powers(poly_sparsity_order) + + ! ! Copy in the highest unconstrained power + ! ! Duplicate & copy the matrix, but ensure there is a diagonal present + ! call mat_duplicate_copy_plus_diag(matrix_powers(poly_sparsity_order), reuse_triggered, cmat) + + ! ! We know we will never have non-zero locations outside of the highest constrained sparsity power + ! call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) + ! call MatSetOption(cmat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) + ! ! We know we are only going to insert local vals + ! ! These options should turn off any reductions in the assembly + ! call MatSetOption(cmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + + ! ! ~~~~~~~~~~~~ + ! ! If we're in parallel we need to get the off-process rows of matrix that correspond + ! ! to the columns of mat_sparsity_match + ! ! We can therefore do the matmult for every constrained power locally with just that data + ! ! ~~~~~~~~~~~~ + ! ! Have to double check comm_size /= 1 as we might be on a subcommunicator and we can't call + ! ! MatMPIAIJGetSeqAIJ specifically if that's the case + ! if (comm_size /= 1) then + + ! ! ~~~~ + ! ! Get the cols + ! ! ~~~~ + ! call MatMPIAIJGetSeqAIJ(mat_sparsity_match, Ad, Ao, colmap, ierr) + + ! call MatGetSize(Ad, rows_ad, cols_ad, ierr) + ! ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns + ! call MatGetSize(Ao, rows_ao, cols_ao, ierr) + + ! ! For the column indices we need to take all the columns of mat_sparsity_match + ! A_array = mat_sparsity_match%v + + ! ! These are the global indices of the columns we want + ! allocate(col_indices_off_proc_array(cols_ad + cols_ao)) + ! allocate(ad_indices(cols_ad)) + ! ! Local rows (as global indices) + ! do ifree = 1, cols_ad + ! ad_indices(ifree) = global_row_start + ifree - 1 + ! end do + + ! ! col_indices_off_proc_array is now sorted, which are the global indices of the columns we want + ! call merge_pre_sorted(ad_indices, colmap, col_indices_off_proc_array) + ! deallocate(ad_indices) + + ! ! Create the sequential IS we want with the cols we want (written as global indices) + ! call ISCreateGeneral(PETSC_COMM_SELF, cols_ad + cols_ao, & + ! col_indices_off_proc_array, PETSC_USE_POINTER, col_indices(1), ierr) + ! call ISCreateGeneral(PETSC_COMM_SELF, cols_ao, & + ! colmap, PETSC_USE_POINTER, row_indices(1), ierr) + + ! ! ~~~~~~~ + ! ! Now we can pull out the chunk of matrix that we need + ! ! ~~~~~~~ + + ! ! We need off-processor rows to compute matrix powers + ! ! Setting this is necessary to avoid an allreduce when calling createsubmatrices + ! ! This will be reset to false after the call to createsubmatrices + ! call MatSetOption(matrix, MAT_SUBMAT_SINGLEIS, PETSC_TRUE, ierr) + + ! ! Now this will be doing comms to get the non-local rows we want + ! ! But only including the columns of the local fixed sparsity, as we don't need all the + ! ! columns of the non-local entries unless we are doing a full matmatmult + ! ! This returns a sequential matrix + ! if (.NOT. PetscObjectIsNull(reuse_mat)) then + ! reuse_submatrices(1) = reuse_mat + ! call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_REUSE_MATRIX, reuse_submatrices, ierr) + ! else + ! call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_INITIAL_MATRIX, reuse_submatrices, ierr) + ! reuse_mat = reuse_submatrices(1) + ! end if + ! 
row_size = size(col_indices_off_proc_array) + ! call ISDestroy(col_indices(1), ierr) + ! call ISDestroy(row_indices(1), ierr) + + ! ! Easy in serial as we have everything we neeed + ! else + + ! Ad = mat_sparsity_match + ! cols_ad = local_cols + ! allocate(reuse_submatrices(1)) + ! deallocate_submatrices = .TRUE. + ! reuse_submatrices(1) = matrix + ! row_size = local_rows + ! allocate(col_indices_off_proc_array(local_rows)) + ! do ifree = 1, local_rows + ! col_indices_off_proc_array(ifree) = ifree-1 + ! end do + ! end if + + ! ! ~~~~~~~~~ + ! ! Now that we are here, reuse_submatrices(1) contains A^poly_sparsity_order with all of the rows + ! ! that correspond to the non-zero columns of matrix + ! ! ~~~~~~~~~ + + ! ! Have to get the max nnzs of the local and off-local rows we've just retrieved + ! max_nnzs = 0 + ! do ifree = global_row_start, global_row_end_plus_one-1 + ! call MatGetRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + ! if (ncols > max_nnzs) max_nnzs = ncols + ! call MatRestoreRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + ! end do + ! if (comm_size /= 1) then + ! do ifree = 1, cols_ao + ! call MatGetRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + ! if (ncols > max_nnzs) max_nnzs = ncols + ! call MatRestoreRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + ! end do + ! end if + ! ! and also the sparsity power + ! do ifree = global_row_start, global_row_end_plus_one-1 + ! call MatGetRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + ! if (ncols > max_nnzs) max_nnzs = ncols + ! call MatRestoreRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + ! end do + + ! ! ~~~~~~~~ + ! ! Get pointers to the sequential aij structure so we don't have to put critical regions + ! ! around the matgetrow + ! ! ~~~~~~~~ + ! call MatGetRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr) + ! if (.NOT. done) then + ! print *, "Pointers not set in call to MatGetRowIJF" + ! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + ! end if + ! ! Returns the wrong size pointer and can break if that size goes negative?? + ! !call MatSeqAIJGetArrayF90(reuse_submatrices(1),submatrices_vals,ierr); + ! A_array = reuse_submatrices(1)%v + ! ! Now we must never overwrite the values in this pointer, and we must + ! ! never call restore on it, see comment on top of the commented out + ! ! MatSeqAIJRestoreArray below + ! call MatSeqAIJGetArrayF90_mine(A_array, vals_c_ptr) + ! call c_f_pointer(vals_c_ptr, submatrices_vals, shape=[size(submatrices_ja)]) + + ! ! ~~~~~~~~~~ + + ! allocate(cols(max_nnzs)) + ! allocate(vals(max_nnzs)) + ! allocate(vals_power_temp(max_nnzs)) + ! allocate(vals_previous_power_temp(max_nnzs)) + ! allocate(cols_index_one(max_nnzs)) + ! allocate(cols_index_two(max_nnzs)) + + ! ! Scale the highest constrained power + ! call MatScale(cmat, coefficients(poly_sparsity_order+1), ierr) + + ! ! Then go backwards and add in each of the coefficients * A^order from the second highest order down + ! do order = poly_sparsity_order - 1, 1, -1 + + ! ! Do result = alpha_1 * A_ff + alpha_2 * A_ff^2 + .... + ! ! Can use SUBSET_NONZERO_PATTERN as we have put the highest order power in first + ! 
call MatAXPY(cmat, coefficients(order+1), matrix_powers(order), SUBSET_NONZERO_PATTERN , ierr) + ! end do + + ! ! Add in the 0th order term + ! do i_loc = 1, local_rows + + ! ! Add in the I term - 0th order term + ! call MatSetValue(cmat, global_row_start + i_loc-1, global_row_start + i_loc-1, & + ! coefficients(1), ADD_VALUES, ierr) + ! end do + + ! ! ~~~~~~~~~~~~ + ! ! From here we now have cmat with the correct values up to the power poly_sparsity_order + ! ! and hence we want to add in the sparsity constrained powers + ! ! ~~~~~~~~~~~~ + + ! ! Now go through and compute the sum of the matrix powers + ! ! We're doing row-wise matmatmults here assuming the fixed sparsity + ! ! We exploit the fact that the subsequent matrix powers can be done + ! ! one row at a time, so we only have to retrieve the needed vals from mat_sparsity_match once + ! do i_loc = 1, local_rows + + ! ! Get the row of mat_sparsity_match + ! call MatGetRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & + ! cols_ptr, vals_ptr, ierr) + ! ! Copying here because mat_sparsity_match and matrix are often the same matrix + ! ! and hence we can only have one active matgetrow + ! ncols = ncols_two + ! cols(1:ncols) = cols_ptr(1:ncols) + ! vals(1:ncols) = vals_ptr(1:ncols) + ! call MatRestoreRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & + ! cols_ptr, vals_ptr, ierr) + + ! ! This is just a symbolic for the set of rows given in cols + ! ! Let's just do all the column matching and extraction of the values once + + ! ! Allocate some space to store the matching indices + ! allocate(symbolic_ones(ncols)) + ! allocate(symbolic_vals(ncols)) + ! row_index_into_submatrix = 1 + + ! ! This is a row-wise product + ! do j_loc = 1, ncols + + ! ! If we're trying to access a local row in matrix + ! if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + + ! call MatGetRow(matrix, cols(j_loc), ncols_two, & + ! cols_two_ptr, vals_two_ptr, ierr) + + ! ! If we're trying to access a non-local row in matrix + ! else + + ! ! this is local row index we want into reuse_submatrices(1) (as row_indices used to extract are just colmap) + ! ! We know cols is sorted, so every non-local index will be greater than the last one + ! ! (it's just that cols could have some local ones between different non-local) + ! ! colmap is also sorted and we know every single non-local entry in cols(j_loc) is in colmap + ! do while (row_index_into_submatrix .le. cols_ao .AND. colmap(row_index_into_submatrix) .lt. cols(j_loc)) + ! row_index_into_submatrix = row_index_into_submatrix + 1 + ! end do + + ! ! This is the number of columns + ! ncols_two = submatrices_ia(row_index_into_submatrix+1) - submatrices_ia(row_index_into_submatrix) + ! allocate(cols_two_ptr(ncols_two)) + ! ! This is the local column indices in reuse_submatrices(1) + ! cols_two_ptr = submatrices_ja(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + ! ! Because col_indices_off_proc_array (and hence the column indices in reuse_submatrices(1) is sorted, + ! ! then cols_two_ptr contains the sorted global column indices + ! cols_two_ptr = col_indices_off_proc_array(cols_two_ptr+1) + + ! ! This is the values + ! vals_two_ptr => & + ! submatrices_vals(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + ! end if + + ! ! Search for the matching column + ! ! We're intersecting the global column indices of mat_sparsity_match (cols) and matrix (cols_two_ptr) + ! 
call intersect_pre_sorted_indices_only(cols(1:ncols), cols_two_ptr, cols_index_one, cols_index_two, match_counter) + + ! ! Don't need to do anything if we have no matches + ! if (match_counter == 0) then + ! ! Store that we can skip this entry + ! symbolic_ones(j_loc)%ptr => null() + ! symbolic_vals(j_loc)%ptr => null() + ! else + + ! ! These are the matching local column indices for this row of mat_sparsity_match + ! allocate(symbolic_ones(j_loc)%ptr(match_counter)) + ! symbolic_ones(j_loc)%ptr = cols_index_one(1:match_counter) + + ! ! These are the matching values of matrix + ! allocate(symbolic_vals(j_loc)%ptr(match_counter)) + ! symbolic_vals(j_loc)%ptr = vals_two_ptr(cols_index_two(1:match_counter)) + ! end if + + ! ! Restore local row of matrix + ! if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + ! call MatRestoreRow(matrix, cols(j_loc), ncols_two, & + ! cols_two_ptr, vals_two_ptr, ierr) + ! else + ! deallocate(cols_two_ptr) + ! end if + ! end do + + ! ! Start with the values of mat_sparsity_match in it + ! vals_previous_power_temp(1:ncols) = vals(1:ncols) + + ! ! Loop over any matrix powers + ! ! vals_power_temp stores the value of A^(term-1) for this row, and we update this as we go through + ! ! the term loop + ! do term = poly_sparsity_order+2, size(coefficients) + + ! ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns + ! vals_power_temp(1:ncols) = 0 + + ! ! Have to finish all the columns before we move onto the next coefficient + ! do j_loc = 1, ncols + + ! ! If we have no matching columns cycle this row + ! if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! ! symbolic_vals(j_loc)%ptr has the matching values of A in it + ! vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) + & + ! vals_previous_power_temp(j_loc) * symbolic_vals(j_loc)%ptr + + ! end do + + ! ! ~~~~~~~~~~~ + ! ! Now can add the value of coeff * A^(term-1) to our matrix + ! ! Can skip this if coeff is zero, but still need to compute A^(term-1) + ! ! for the next time through + ! ! ~~~~~~~~~~~ + ! if (ncols /= 0 .AND. coefficients(term) /= 0d0) then + ! call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + ! coefficients(term) * vals_power_temp, ADD_VALUES, ierr) + ! end if + + ! ! This should now have the value of A^(term-1) in it + ! vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) + ! end do + + ! ! Delete our symbolic + ! do j_loc = 1, ncols + ! if (associated(symbolic_ones(j_loc)%ptr)) then + ! deallocate(symbolic_ones(j_loc)%ptr) + ! deallocate(symbolic_vals(j_loc)%ptr) + ! end if + ! end do + ! deallocate(symbolic_vals, symbolic_ones) + ! end do + + ! call MatRestoreRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr) + ! ! We very deliberately don't call restorearray here! + ! ! There is no matseqaijgetarrayread or matseqaijrestorearrayread in Fortran + ! ! Those routines don't increment the PetscObjectStateGet which tells petsc + ! ! the mat has changed. Hence above we directly access the data pointer with + ! ! a call to MatSeqAIJGetArrayF90_mine and then never write into it + ! ! If we call the restorearrayf90, that does increment the object state + ! ! even though we only read from the array + ! ! That would mean if we pass in a pc->pmat for example, just setting up a pc + ! ! would trigger petsc setting up the pc on every iteration of the pc + ! ! 
call MatSeqAIJRestoreArray(reuse_submatrices(1),submatrices_vals,ierr); + + ! ! ~~~~~~~~~~~ + + ! ! Do the assembly, should need zero reductions in this given we've set the + ! ! flags above + ! call MatAssemblyBegin(cmat, MAT_FINAL_ASSEMBLY, ierr) + + ! ! Delete temporaries + ! do order = 2, poly_sparsity_order + ! call MatDestroy(matrix_powers(order), ierr) + ! end do + ! if (deallocate_submatrices) then + ! deallocate(reuse_submatrices) + ! reuse_submatrices => null() + ! end if + + ! deallocate(col_indices_off_proc_array) + ! deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, cols_index_one, cols_index_two) + + ! ! Finish assembly + ! call MatAssemblyEnd(cmat, MAT_FINAL_ASSEMBLY, ierr) + + + end subroutine mat_mult_powers_share_sparsity_newton_cpu ! ------------------------------------------------------------------------------------------------------------------------------- @@ -845,11 +1399,10 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! If we're constraining sparsity we've built a custom matrix-powers that assumes fixed sparsity if (poly_sparsity_order < poly_order) then - ! ! This routine is a custom one that builds our matrix powers and assumes fixed sparsity - ! ! so that it doen't have to do much comms - ! ! This also finishes off the asyn comms and computes the coefficients - ! call mat_mult_powers_share_sparsity(matrix, poly_order, poly_sparsity_order, buffers, coefficients, & - ! reuse_mat, reuse_submatrices, inv_matrix) + ! This routine is a custom one that builds our matrix powers and assumes fixed sparsity + ! so that it doen't have to do much comms + call mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsity_order, coefficients, & + reuse_mat, reuse_submatrices, inv_matrix) ! ! Then just return return @@ -1053,6 +1606,137 @@ subroutine build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, c end subroutine build_gmres_polynomial_newton_inverse_0th_order + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly_order, coefficients, & + inv_matrix) + + ! Specific inverse with 0th order sparsity + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + + ! Local variables + integer :: order + PetscErrorCode :: ierr + logical :: reuse_triggered + type(tVec) :: inv_vec, diag_vec, product_vec, temp_vec_A, one_vec, temp_vec_two + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + + ! Our matrix has to be square + call MatCreateVecs(matrix, product_vec, diag_vec, ierr) + call MatGetDiagonal(matrix, diag_vec, ierr) + + ! This stores D^order + if (.NOT. reuse_triggered) then + call VecDuplicate(diag_vec, inv_vec, ierr) + else + call MatDiagonalGetDiagonal(inv_matrix, inv_vec, ierr) + end if + call VecDuplicate(diag_vec, temp_vec_A, ierr) + call VecDuplicate(diag_vec, one_vec, ierr) + call VecDuplicate(diag_vec, temp_vec_two, ierr) + + ! Set to zero as we add to it + call VecSet(inv_vec, 0d0, ierr) + ! We start with an identity in product_vec + call VecSet(product_vec, 1d0, ierr) + call VecSet(one_vec, 1d0, ierr) + + order = 1 + do while (order .le. size(coefficients, 1) - 1) + + ! 
temp_vec_A is going to store things with the sparsity of A + call VecCopy(diag_vec, temp_vec_A, ierr) + + ! If real this is easy + if (coefficients(order,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(order,1)) < 1e-12) then + order = order + 1 + cycle + end if + + call VecAXPY(inv_vec, 1d0/coefficients(order,1), product_vec, ierr) + + ! temp_vec_A = A_ff/theta_k + call VecScale(temp_vec_A, -1d0/coefficients(order,1), ierr) + ! temp_vec_A = I - A_ff/theta_k + call VecAXPY(temp_vec_A, 1d0, one_vec, ierr) + + ! product_vec = product_vec * temp_vec_A + call VecPointwiseMult(product_vec, product_vec, temp_vec_A, ierr) + + order = order + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(order,1)**2 + coefficients(order,2)**2 < 1e-12) then + order = order + 2 + cycle + end if + + ! Compute 2a I - A + ! temp_vec_A = -A + call VecScale(temp_vec_A, -1d0, ierr) + ! temp_vec_A = 2a I - A_ff + call VecAXPY(temp_vec_A, 2d0 * coefficients(order,1), one_vec, ierr) + ! temp_vec_A = (2a I - A_ff)/(a^2 + b^2) + call VecScale(temp_vec_A, 1d0/(coefficients(order,1)**2 + coefficients(order,2)**2), ierr) + + ! temp_vec_two = temp_vec_A * product_vec + call VecPointwiseMult(temp_vec_two, temp_vec_A, product_vec, ierr) + call VecAXPY(inv_vec, 1d0, temp_vec_two, ierr) + + if (order .le. size(coefficients, 1) - 2) then + ! temp_vec_two = A * temp_vec_two + call VecPointwiseMult(temp_vec_two, diag_vec, temp_vec_two, ierr) + call VecAXPY(product_vec, -1d0, temp_vec_two, ierr) + end if + + ! Skip two evals + order = order + 2 + + end if + end do + + ! Final step if last root is real + if (coefficients(size(coefficients,1),2) == 0d0) then + ! Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(order,1)) > 1e-12) then + call VecAXPY(inv_vec, 1d0/coefficients(order,1), product_vec, ierr) + end if + end if + + ! We may be reusing with the same sparsity + if (.NOT. reuse_triggered) then + ! The matrix takes ownership of inv_vec and increases ref counter + call MatCreateDiagonal(inv_vec, inv_matrix, ierr) + call VecDestroy(inv_vec, ierr) + else + call MatDiagonalRestoreDiagonal(inv_matrix, inv_vec, ierr) + end if + + call VecDestroy(diag_vec, ierr) + call VecDestroy(product_vec, ierr) + call VecDestroy(temp_vec_A, ierr) + call VecDestroy(one_vec, ierr) + call VecDestroy(temp_vec_two, ierr) + + end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton + ! 
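The diagonal-only (0th order sparsity) routine above reduces the Newton-basis product to an elementwise recurrence on diag(A). A minimal NumPy sketch of that recurrence, illustrative only and not part of this patch, assuming roots is an (m, 2) array of (real, imag) harmonic Ritz values:

    import numpy as np

    def newton_poly_diag_inverse(diag, roots):
        """Diagonal of the approximate inverse: the Newton-basis GMRES
        polynomial applied to diag(A) only (0th order sparsity)."""
        d = np.asarray(diag, dtype=float)
        inv = np.zeros_like(d)
        prod = np.ones_like(d)            # running product, starts as the identity
        m = roots.shape[0]
        i = 0
        while i < m - 1:
            a, b = roots[i]
            if b == 0.0:                  # single real root theta_i
                if abs(a) < 1e-12:        # skip numerically zero eigenvalues
                    i += 1
                    continue
                inv += prod / a
                prod *= 1.0 - d / a       # prod <- (I - A/theta_i) prod, diagonal case
                i += 1
            else:                         # complex conjugate pair a +- ib
                mod2 = a * a + b * b
                if mod2 < 1e-12:
                    i += 2
                    continue
                tmp = (2.0 * a - d) / mod2 * prod
                inv += tmp
                if i <= m - 3:            # only update prod if more roots follow
                    prod -= d * tmp
                i += 2
        if roots[m - 1, 1] == 0.0 and abs(roots[m - 1, 0]) > 1e-12:
            inv += prod / roots[m - 1, 0]
        return inv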
------------------------------------------------------------------------------------------------------------------------------- diff --git a/tests/Makefile b/tests/Makefile index d7c378c..514ae5e 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -280,6 +280,12 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 +# + @echo "" + @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change with 0th order fixed sparsity" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 + @echo "Test AIRG Newton with GMRES polynomials with PC regenerated with no sparsity change with 0th order fixed sparsity" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 # @echo "" @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change and polynomial coeffs stored" @@ -530,7 +536,15 @@ run_tests_no_load_parallel: -pc_air_a_drop 1e-3 -pc_air_inverse_type newton @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ - -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change in parallel with 0th order fixed sparsity" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with GMRES polynomials with PC regenerated with no sparsity change in parallel with 0th order fixed sparsity" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test solving isotropic diffusion with fast coarsening and near-nullspace in parallel" From 784dcc7dd10744a6fd9c30a16ff1f5a67c5cfbce Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 17 Jan 2026 00:36:15 +0000 Subject: [PATCH 08/41] Most of the work for the fixed sparsity Newton polynomials on CPUs is done, just need to finish the higher order fixed sparsity terms in mat_mult_powers_share_sparsity_newton_cpu. --- src/Gmres_Poly_Newton.F90 | 1333 +++++++++++++++++++++---------------- 1 file changed, 754 insertions(+), 579 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index a7896ce..f47f437 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -469,7 +469,7 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) type(tVec) :: y ! Local - integer :: order, errorcode + integer :: i, errorcode PetscErrorCode :: ierr type(mat_ctxtype), pointer :: mat_ctx => null() @@ -487,24 +487,24 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) call VecSet(y, 0d0, ierr) ! ~~~~~~~~~~~~ - ! Iterate over the order + ! Iterate over the i ! ~~~~~~~~~~~~ - order = 1 - do while (order .le. 
size(mat_ctx%real_roots) - 1) + i = 1 + do while (i .le. size(mat_ctx%real_roots) - 1) ! If real this is easy - if (mat_ctx%imag_roots(order) == 0d0) then + if (mat_ctx%imag_roots(i) == 0d0) then ! Skips eigenvalues that are numerically zero - see ! the comment in calculate_gmres_polynomial_roots_newton - if (abs(mat_ctx%real_roots(order)) < 1e-12) then - order = order + 1 + if (abs(mat_ctx%real_roots(i)) < 1e-12) then + i = i + 1 cycle end if ! y = y + theta_i * MF_VEC_TEMP call VecAXPY(y, & - 1d0/mat_ctx%real_roots(order), & + 1d0/mat_ctx%real_roots(i), & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) ! MF_VEC_DIAG = A * MF_VEC_TEMP @@ -512,10 +512,10 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_TEMP), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) ! MF_VEC_TEMP = MF_VEC_TEMP - theta_i * MF_VEC_DIAG call VecAXPY(mat_ctx%mf_temp_vec(MF_VEC_TEMP), & - -1d0/mat_ctx%real_roots(order), & + -1d0/mat_ctx%real_roots(i), & mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) - order = order + 1 + i = i + 1 ! If imaginary, then have to combine the e'val and its ! complex conjugate to keep the arithmetic real @@ -523,8 +523,8 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) else ! Skips eigenvalues that are numerically zero - if (mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2 < 1e-12) then - order = order + 2 + if (mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2 < 1e-12) then + i = i + 2 cycle end if @@ -532,27 +532,27 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_TEMP), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) ! MF_VEC_DIAG = 2 * Re(theta_i) * MF_VEC_TEMP - MF_VEC_DIAG call VecAXPBY(mat_ctx%mf_temp_vec(MF_VEC_DIAG), & - 2 * mat_ctx%real_roots(order), & + 2 * mat_ctx%real_roots(i), & -1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) ! y = y + 1/(Re(theta_i)^2 + Imag(theta_i)^2) * MF_VEC_DIAG call VecAXPY(y, & - 1d0/(mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2), & + 1d0/(mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2), & mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) - if (order .le. size(mat_ctx%real_roots) - 2) then + if (i .le. size(mat_ctx%real_roots) - 2) then ! MF_VEC_RHS = A * MF_VEC_DIAG call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_DIAG), mat_ctx%mf_temp_vec(MF_VEC_RHS), ierr) ! MF_VEC_TEMP = MF_VEC_TEMP - 1/(Re(theta_i)^2 + Imag(theta_i)^2) * MF_VEC_RHS call VecAXPY(mat_ctx%mf_temp_vec(MF_VEC_TEMP), & - -1d0/(mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2), & + -1d0/(mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2), & mat_ctx%mf_temp_vec(MF_VEC_RHS), ierr) end if ! Skip two evals - order = order + 2 + i = i + 2 end if end do @@ -561,11 +561,11 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) if (mat_ctx%imag_roots(size(mat_ctx%real_roots)) == 0d0) then ! Skips eigenvalues that are numerically zero - if (abs(mat_ctx%real_roots(order)) > 1e-12) then + if (abs(mat_ctx%real_roots(i)) > 1e-12) then ! 
y = y + theta_i * MF_VEC_TEMP call VecAXPBY(y, & - 1d0/mat_ctx%real_roots(order), & + 1d0/mat_ctx%real_roots(i), & 1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) end if @@ -818,7 +818,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt :: global_row_start, global_row_end_plus_one, row_index_into_submatrix PetscInt :: global_col_start, global_col_end_plus_one, n, ncols, ncols_two, ifree, max_nnzs PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0 - integer :: errorcode, match_counter, term, order + integer :: errorcode, match_counter, term integer :: comm_size PetscErrorCode :: ierr integer, dimension(:), allocatable :: cols_index_one, cols_index_two @@ -829,7 +829,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt, dimension(:), pointer :: colmap logical :: deallocate_submatrices = .FALSE. type(c_ptr) :: vals_c_ptr - type(tMat), dimension(size(coefficients)-1), target :: matrix_powers type(tMat), pointer :: mat_sparsity_match type(int_vec), dimension(:), allocatable :: symbolic_ones type(real_vec), dimension(:), allocatable :: symbolic_vals @@ -842,6 +841,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp logical :: reuse_triggered PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done PetscInt, parameter :: one = 1, zero = 0 + logical :: output_first_complex ! ~~~~~~~~~~ @@ -863,363 +863,362 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp reuse_triggered = .NOT. PetscObjectIsNull(cmat) - ! ! ~~~~~~~~~~ - ! ! Compute any matrix powers we might need to constrain sparsity and start assembling the - ! ! components of the output matrix up to the order of poly_sparsity_order - ! ! The powers higher than poly_sparsity_order can be done with only - ! ! a single bit of comms and is done below this - ! ! ~~~~~~~~~~ - - ! ! matrix_powers stores all the powers of the input matrix - ! matrix_powers(1) = matrix - - ! ! What power of A do we want to match the sparsity of - ! ! Compute the power we need if we're two or above - ! do order = 2, poly_sparsity_order - - ! ! Let's just store each power, that way we can set the sparsity - ! ! as the highest (unconstrained) power and do the mataxpy with a subset of entries - ! ! Takes more memory to do this but is faster - ! call MatMatMult(matrix, matrix_powers(order-1), & - ! MAT_INITIAL_MATRIX, 1.5d0, matrix_powers(order), ierr) - ! end do - - ! ! mat_sparsity_match now contains the sparsity of the power of A we want to match - ! mat_sparsity_match => matrix_powers(poly_sparsity_order) - - ! ! Copy in the highest unconstrained power - ! ! Duplicate & copy the matrix, but ensure there is a diagonal present - ! call mat_duplicate_copy_plus_diag(matrix_powers(poly_sparsity_order), reuse_triggered, cmat) - - ! ! We know we will never have non-zero locations outside of the highest constrained sparsity power - ! call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) - ! call MatSetOption(cmat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - ! ! We know we are only going to insert local vals - ! ! These options should turn off any reductions in the assembly - ! call MatSetOption(cmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + ! ~~~~~~~~~~ + ! Compute cmat for all powers up to poly_sparsity_order + ! We have to be more careful here than in the monomial case + ! 
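The matrix-free application above follows the product form of the Newton-basis polynomial (Loe & Thornquist 2021), combining each complex-conjugate root pair so all arithmetic stays real. A minimal dense NumPy sketch of the same recurrence, illustrative only, with A, x and roots as plain arrays:

    import numpy as np

    def apply_newton_poly(A, x, roots):
        """y ~= p(A) x with p in Newton (product) form; complex pairs are
        combined into a single real quadratic factor."""
        m = roots.shape[0]
        y = np.zeros_like(x)
        temp = x.copy()                       # running product applied to x
        i = 0
        while i < m - 1:
            a, b = roots[i]
            if b == 0.0:                      # single real root theta_i
                if abs(a) < 1e-12:
                    i += 1
                    continue
                y += temp / a
                temp -= (A @ temp) / a        # temp <- (I - A/theta_i) temp
                i += 1
            else:                             # pair theta = a +- ib
                mod2 = a * a + b * b
                if mod2 < 1e-12:
                    i += 2
                    continue
                w = 2.0 * a * temp - A @ temp     # (2a I - A) temp
                y += w / mod2
                if i <= m - 3:
                    temp -= (A @ w) / mod2        # temp <- temp - A (2a I - A) temp / |theta|^2
                i += 2
        if roots[m - 1, 1] == 0.0 and abs(roots[m - 1, 0]) > 1e-12:
            y += temp / roots[m - 1, 0]
        return y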
In the mononomial case we just compute the matrix powers up to poly_sparsity_order + ! and add them times the coefficients to cmat + ! Here though we have to build the Newton basis polynomials + ! The complex conjugate roots are tricky as they build up two powers at a time + ! The powers higher than poly_sparsity_order can be done with only + ! a single bit of comms and is done below this + ! ~~~~~~~~~~ + output_first_complex = .FALSE. + if (poly_sparsity_order == 1) then + + ! If we've got first order sparsity, we want to build cmat up to first order + ! and then we add in higher order powers later + ! We can just pass in the first two roots to build the first order gmres polynomial + ! mat_sparsity_match gets out the parts of the product up to 1st order + ! for the real case this will be the equivalent of prod on line 5 of Alg 3 in Loe 2021 + ! I - 1/theta_1 A + ! whereas cmat will be 1/theta_1 + 1/theta_2 * (I - 1/theta_1 A) + ! For the complex case we instead pass out tmp from line 9 scaled by 1/(a^2 + b^2) + ! as this is the part of the product with sparsity up to A + ! This is because the prod for complex builds up the A^2 term for the next iteration + ! given it does two roots at a time + + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat) + + call build_gmres_polynomial_newton_inverse_1st_1st(matrix, one, & + coefficients(1:poly_sparsity_order + 1, 1:2), & + cmat, mat_sparsity_match) + else + + ! If we're any higher, then we build cmat up to that order + ! But we have to be careful because the last root we want to explicitly + ! build up to here (ie the power of the matrix given by poly_sparsity_order) + ! might be the first root of a complex conjugate pair + ! In that case cmat only contains part of the result up to poly_sparsity_order + ! Similarly mat_sparsity_match contains the product up to poly_sparsity_order + ! The rest gets added in below + ! output_first_complex records if poly_sparsity_order hits the first root + ! of a complex conjugate pair, as we need to know that below to add in the rest + ! of the poly_sparsity_order+1 term from that pair + ! before moving on to the rest of the higher order roots + call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex) + end if + + ! We know we will never have non-zero locations outside of the highest constrained sparsity power + call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) + call MatSetOption(cmat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) + ! We know we are only going to insert local vals + ! These options should turn off any reductions in the assembly + call MatSetOption(cmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) - ! ! ~~~~~~~~~~~~ - ! ! If we're in parallel we need to get the off-process rows of matrix that correspond - ! ! to the columns of mat_sparsity_match - ! ! We can therefore do the matmult for every constrained power locally with just that data - ! ! ~~~~~~~~~~~~ - ! ! Have to double check comm_size /= 1 as we might be on a subcommunicator and we can't call - ! ! MatMPIAIJGetSeqAIJ specifically if that's the case - ! if (comm_size /= 1) then - - ! ! ~~~~ - ! ! Get the cols - ! ! ~~~~ - ! call MatMPIAIJGetSeqAIJ(mat_sparsity_match, Ad, Ao, colmap, ierr) - - ! call MatGetSize(Ad, rows_ad, cols_ad, ierr) - ! ! 
We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns - ! call MatGetSize(Ao, rows_ao, cols_ao, ierr) - - ! ! For the column indices we need to take all the columns of mat_sparsity_match - ! A_array = mat_sparsity_match%v - - ! ! These are the global indices of the columns we want - ! allocate(col_indices_off_proc_array(cols_ad + cols_ao)) - ! allocate(ad_indices(cols_ad)) - ! ! Local rows (as global indices) - ! do ifree = 1, cols_ad - ! ad_indices(ifree) = global_row_start + ifree - 1 - ! end do - - ! ! col_indices_off_proc_array is now sorted, which are the global indices of the columns we want - ! call merge_pre_sorted(ad_indices, colmap, col_indices_off_proc_array) - ! deallocate(ad_indices) - - ! ! Create the sequential IS we want with the cols we want (written as global indices) - ! call ISCreateGeneral(PETSC_COMM_SELF, cols_ad + cols_ao, & - ! col_indices_off_proc_array, PETSC_USE_POINTER, col_indices(1), ierr) - ! call ISCreateGeneral(PETSC_COMM_SELF, cols_ao, & - ! colmap, PETSC_USE_POINTER, row_indices(1), ierr) - - ! ! ~~~~~~~ - ! ! Now we can pull out the chunk of matrix that we need - ! ! ~~~~~~~ - - ! ! We need off-processor rows to compute matrix powers - ! ! Setting this is necessary to avoid an allreduce when calling createsubmatrices - ! ! This will be reset to false after the call to createsubmatrices - ! call MatSetOption(matrix, MAT_SUBMAT_SINGLEIS, PETSC_TRUE, ierr) + ! ~~~~~~~~~~~~ + ! If we're in parallel we need to get the off-process rows of matrix that correspond + ! to the columns of mat_sparsity_match + ! We can therefore do the matmult for every constrained power locally with just that data + ! ~~~~~~~~~~~~ + ! Have to double check comm_size /= 1 as we might be on a subcommunicator and we can't call + ! MatMPIAIJGetSeqAIJ specifically if that's the case + if (comm_size /= 1) then + + ! ~~~~ + ! Get the cols + ! ~~~~ + call MatMPIAIJGetSeqAIJ(mat_sparsity_match, Ad, Ao, colmap, ierr) + + call MatGetSize(Ad, rows_ad, cols_ad, ierr) + ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns + call MatGetSize(Ao, rows_ao, cols_ao, ierr) + + ! For the column indices we need to take all the columns of mat_sparsity_match + A_array = mat_sparsity_match%v + + ! These are the global indices of the columns we want + allocate(col_indices_off_proc_array(cols_ad + cols_ao)) + allocate(ad_indices(cols_ad)) + ! Local rows (as global indices) + do ifree = 1, cols_ad + ad_indices(ifree) = global_row_start + ifree - 1 + end do + + ! col_indices_off_proc_array is now sorted, which are the global indices of the columns we want + call merge_pre_sorted(ad_indices, colmap, col_indices_off_proc_array) + deallocate(ad_indices) + + ! Create the sequential IS we want with the cols we want (written as global indices) + call ISCreateGeneral(PETSC_COMM_SELF, cols_ad + cols_ao, & + col_indices_off_proc_array, PETSC_USE_POINTER, col_indices(1), ierr) + call ISCreateGeneral(PETSC_COMM_SELF, cols_ao, & + colmap, PETSC_USE_POINTER, row_indices(1), ierr) + + ! ~~~~~~~ + ! Now we can pull out the chunk of matrix that we need + ! ~~~~~~~ + + ! We need off-processor rows to compute matrix powers + ! Setting this is necessary to avoid an allreduce when calling createsubmatrices + ! This will be reset to false after the call to createsubmatrices + call MatSetOption(matrix, MAT_SUBMAT_SINGLEIS, PETSC_TRUE, ierr) - ! ! Now this will be doing comms to get the non-local rows we want - ! ! 
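merge_pre_sorted is assumed here only to produce the sorted union of the two already-sorted index lists (local column indices and the off-process colmap, which are disjoint); in Python terms it presumably behaves like:

    def merge_pre_sorted(a, b):
        """Merge two already-sorted integer index lists into one sorted list."""
        out, i, j = [], 0, 0
        while i < len(a) and j < len(b):
            if a[i] <= b[j]:
                out.append(a[i]); i += 1
            else:
                out.append(b[j]); j += 1
        out.extend(a[i:])
        out.extend(b[j:])
        return out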
But only including the columns of the local fixed sparsity, as we don't need all the - ! ! columns of the non-local entries unless we are doing a full matmatmult - ! ! This returns a sequential matrix - ! if (.NOT. PetscObjectIsNull(reuse_mat)) then - ! reuse_submatrices(1) = reuse_mat - ! call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_REUSE_MATRIX, reuse_submatrices, ierr) - ! else - ! call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_INITIAL_MATRIX, reuse_submatrices, ierr) - ! reuse_mat = reuse_submatrices(1) - ! end if - ! row_size = size(col_indices_off_proc_array) - ! call ISDestroy(col_indices(1), ierr) - ! call ISDestroy(row_indices(1), ierr) - - ! ! Easy in serial as we have everything we neeed - ! else - - ! Ad = mat_sparsity_match - ! cols_ad = local_cols - ! allocate(reuse_submatrices(1)) - ! deallocate_submatrices = .TRUE. - ! reuse_submatrices(1) = matrix - ! row_size = local_rows - ! allocate(col_indices_off_proc_array(local_rows)) - ! do ifree = 1, local_rows - ! col_indices_off_proc_array(ifree) = ifree-1 - ! end do - ! end if + ! Now this will be doing comms to get the non-local rows we want + ! But only including the columns of the local fixed sparsity, as we don't need all the + ! columns of the non-local entries unless we are doing a full matmatmult + ! This returns a sequential matrix + if (.NOT. PetscObjectIsNull(reuse_mat)) then + reuse_submatrices(1) = reuse_mat + call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_REUSE_MATRIX, reuse_submatrices, ierr) + else + call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_INITIAL_MATRIX, reuse_submatrices, ierr) + reuse_mat = reuse_submatrices(1) + end if + row_size = size(col_indices_off_proc_array) + call ISDestroy(col_indices(1), ierr) + call ISDestroy(row_indices(1), ierr) + + ! Easy in serial as we have everything we neeed + else + + Ad = mat_sparsity_match + cols_ad = local_cols + allocate(reuse_submatrices(1)) + deallocate_submatrices = .TRUE. + reuse_submatrices(1) = matrix + row_size = local_rows + allocate(col_indices_off_proc_array(local_rows)) + do ifree = 1, local_rows + col_indices_off_proc_array(ifree) = ifree-1 + end do + end if - ! ! ~~~~~~~~~ - ! ! Now that we are here, reuse_submatrices(1) contains A^poly_sparsity_order with all of the rows - ! ! that correspond to the non-zero columns of matrix - ! ! ~~~~~~~~~ - - ! ! Have to get the max nnzs of the local and off-local rows we've just retrieved - ! max_nnzs = 0 - ! do ifree = global_row_start, global_row_end_plus_one-1 - ! call MatGetRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) - ! if (ncols > max_nnzs) max_nnzs = ncols - ! call MatRestoreRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) - ! end do - ! if (comm_size /= 1) then - ! do ifree = 1, cols_ao - ! call MatGetRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) - ! if (ncols > max_nnzs) max_nnzs = ncols - ! call MatRestoreRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) - ! end do - ! end if - ! ! and also the sparsity power - ! do ifree = global_row_start, global_row_end_plus_one-1 - ! call MatGetRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) - ! if (ncols > max_nnzs) max_nnzs = ncols - ! 
call MatRestoreRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) - ! end do + ! ~~~~~~~~~ + ! Now that we are here, reuse_submatrices(1) contains A^poly_sparsity_order with all of the rows + ! that correspond to the non-zero columns of matrix + ! ~~~~~~~~~ + + ! Have to get the max nnzs of the local and off-local rows we've just retrieved + max_nnzs = 0 + do ifree = global_row_start, global_row_end_plus_one-1 + call MatGetRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + if (ncols > max_nnzs) max_nnzs = ncols + call MatRestoreRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + end do + if (comm_size /= 1) then + do ifree = 1, cols_ao + call MatGetRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + if (ncols > max_nnzs) max_nnzs = ncols + call MatRestoreRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + end do + end if + ! and also the sparsity power + do ifree = global_row_start, global_row_end_plus_one-1 + call MatGetRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + if (ncols > max_nnzs) max_nnzs = ncols + call MatRestoreRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr) + end do - ! ! ~~~~~~~~ - ! ! Get pointers to the sequential aij structure so we don't have to put critical regions - ! ! around the matgetrow - ! ! ~~~~~~~~ - ! call MatGetRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr) - ! if (.NOT. done) then - ! print *, "Pointers not set in call to MatGetRowIJF" - ! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - ! end if - ! ! Returns the wrong size pointer and can break if that size goes negative?? - ! !call MatSeqAIJGetArrayF90(reuse_submatrices(1),submatrices_vals,ierr); - ! A_array = reuse_submatrices(1)%v - ! ! Now we must never overwrite the values in this pointer, and we must - ! ! never call restore on it, see comment on top of the commented out - ! ! MatSeqAIJRestoreArray below - ! call MatSeqAIJGetArrayF90_mine(A_array, vals_c_ptr) - ! call c_f_pointer(vals_c_ptr, submatrices_vals, shape=[size(submatrices_ja)]) + ! ~~~~~~~~ + ! Get pointers to the sequential aij structure so we don't have to put critical regions + ! around the matgetrow + ! ~~~~~~~~ + call MatGetRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJF" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + ! Returns the wrong size pointer and can break if that size goes negative?? + !call MatSeqAIJGetArrayF90(reuse_submatrices(1),submatrices_vals,ierr); + A_array = reuse_submatrices(1)%v + ! Now we must never overwrite the values in this pointer, and we must + ! never call restore on it, see comment on top of the commented out + ! MatSeqAIJRestoreArray below + call MatSeqAIJGetArrayF90_mine(A_array, vals_c_ptr) + call c_f_pointer(vals_c_ptr, submatrices_vals, shape=[size(submatrices_ja)]) - ! ! ~~~~~~~~~~ + ! ~~~~~~~~~~ - ! allocate(cols(max_nnzs)) - ! allocate(vals(max_nnzs)) - ! allocate(vals_power_temp(max_nnzs)) - ! allocate(vals_previous_power_temp(max_nnzs)) - ! allocate(cols_index_one(max_nnzs)) - ! allocate(cols_index_two(max_nnzs)) - - ! ! 
Scale the highest constrained power - ! call MatScale(cmat, coefficients(poly_sparsity_order+1), ierr) - - ! ! Then go backwards and add in each of the coefficients * A^order from the second highest order down - ! do order = poly_sparsity_order - 1, 1, -1 - - ! ! Do result = alpha_1 * A_ff + alpha_2 * A_ff^2 + .... - ! ! Can use SUBSET_NONZERO_PATTERN as we have put the highest order power in first - ! call MatAXPY(cmat, coefficients(order+1), matrix_powers(order), SUBSET_NONZERO_PATTERN , ierr) - ! end do - - ! ! Add in the 0th order term - ! do i_loc = 1, local_rows - - ! ! Add in the I term - 0th order term - ! call MatSetValue(cmat, global_row_start + i_loc-1, global_row_start + i_loc-1, & - ! coefficients(1), ADD_VALUES, ierr) - ! end do - - ! ! ~~~~~~~~~~~~ - ! ! From here we now have cmat with the correct values up to the power poly_sparsity_order - ! ! and hence we want to add in the sparsity constrained powers - ! ! ~~~~~~~~~~~~ + allocate(cols(max_nnzs)) + allocate(vals(max_nnzs)) + allocate(vals_power_temp(max_nnzs)) + allocate(vals_previous_power_temp(max_nnzs)) + allocate(cols_index_one(max_nnzs)) + allocate(cols_index_two(max_nnzs)) + + ! ~~~~~~~~~~~~ + ! From here we now have cmat with the correct values up to the power poly_sparsity_order + ! and hence we want to add in the sparsity constrained powers + ! ~~~~~~~~~~~~ - ! ! Now go through and compute the sum of the matrix powers - ! ! We're doing row-wise matmatmults here assuming the fixed sparsity - ! ! We exploit the fact that the subsequent matrix powers can be done - ! ! one row at a time, so we only have to retrieve the needed vals from mat_sparsity_match once - ! do i_loc = 1, local_rows + ! Now go through and compute the sum of the matrix powers + ! We're doing row-wise matmatmults here assuming the fixed sparsity + ! We exploit the fact that the subsequent matrix powers can be done + ! one row at a time, so we only have to retrieve the needed vals from mat_sparsity_match once + do i_loc = 1, local_rows - ! ! Get the row of mat_sparsity_match - ! call MatGetRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & - ! cols_ptr, vals_ptr, ierr) - ! ! Copying here because mat_sparsity_match and matrix are often the same matrix - ! ! and hence we can only have one active matgetrow - ! ncols = ncols_two - ! cols(1:ncols) = cols_ptr(1:ncols) - ! vals(1:ncols) = vals_ptr(1:ncols) - ! call MatRestoreRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & - ! cols_ptr, vals_ptr, ierr) - - ! ! This is just a symbolic for the set of rows given in cols - ! ! Let's just do all the column matching and extraction of the values once + ! Get the row of mat_sparsity_match + call MatGetRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & + cols_ptr, vals_ptr, ierr) + ! Copying here because mat_sparsity_match and matrix are often the same matrix + ! and hence we can only have one active matgetrow + ncols = ncols_two + cols(1:ncols) = cols_ptr(1:ncols) + vals(1:ncols) = vals_ptr(1:ncols) + call MatRestoreRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & + cols_ptr, vals_ptr, ierr) + + ! This is just a symbolic for the set of rows given in cols + ! Let's just do all the column matching and extraction of the values once - ! ! Allocate some space to store the matching indices - ! allocate(symbolic_ones(ncols)) - ! allocate(symbolic_vals(ncols)) - ! row_index_into_submatrix = 1 - - ! ! This is a row-wise product - ! do j_loc = 1, ncols - - ! ! 
If we're trying to access a local row in matrix - ! if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then - - ! call MatGetRow(matrix, cols(j_loc), ncols_two, & - ! cols_two_ptr, vals_two_ptr, ierr) - - ! ! If we're trying to access a non-local row in matrix - ! else - - ! ! this is local row index we want into reuse_submatrices(1) (as row_indices used to extract are just colmap) - ! ! We know cols is sorted, so every non-local index will be greater than the last one - ! ! (it's just that cols could have some local ones between different non-local) - ! ! colmap is also sorted and we know every single non-local entry in cols(j_loc) is in colmap - ! do while (row_index_into_submatrix .le. cols_ao .AND. colmap(row_index_into_submatrix) .lt. cols(j_loc)) - ! row_index_into_submatrix = row_index_into_submatrix + 1 - ! end do - - ! ! This is the number of columns - ! ncols_two = submatrices_ia(row_index_into_submatrix+1) - submatrices_ia(row_index_into_submatrix) - ! allocate(cols_two_ptr(ncols_two)) - ! ! This is the local column indices in reuse_submatrices(1) - ! cols_two_ptr = submatrices_ja(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) - ! ! Because col_indices_off_proc_array (and hence the column indices in reuse_submatrices(1) is sorted, - ! ! then cols_two_ptr contains the sorted global column indices - ! cols_two_ptr = col_indices_off_proc_array(cols_two_ptr+1) - - ! ! This is the values - ! vals_two_ptr => & - ! submatrices_vals(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) - ! end if + ! Allocate some space to store the matching indices + allocate(symbolic_ones(ncols)) + allocate(symbolic_vals(ncols)) + row_index_into_submatrix = 1 + + ! This is a row-wise product + do j_loc = 1, ncols + + ! If we're trying to access a local row in matrix + if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + + call MatGetRow(matrix, cols(j_loc), ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + + ! If we're trying to access a non-local row in matrix + else + + ! this is local row index we want into reuse_submatrices(1) (as row_indices used to extract are just colmap) + ! We know cols is sorted, so every non-local index will be greater than the last one + ! (it's just that cols could have some local ones between different non-local) + ! colmap is also sorted and we know every single non-local entry in cols(j_loc) is in colmap + do while (row_index_into_submatrix .le. cols_ao .AND. colmap(row_index_into_submatrix) .lt. cols(j_loc)) + row_index_into_submatrix = row_index_into_submatrix + 1 + end do + + ! This is the number of columns + ncols_two = submatrices_ia(row_index_into_submatrix+1) - submatrices_ia(row_index_into_submatrix) + allocate(cols_two_ptr(ncols_two)) + ! This is the local column indices in reuse_submatrices(1) + cols_two_ptr = submatrices_ja(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + ! Because col_indices_off_proc_array (and hence the column indices in reuse_submatrices(1) is sorted, + ! then cols_two_ptr contains the sorted global column indices + cols_two_ptr = col_indices_off_proc_array(cols_two_ptr+1) + + ! This is the values + vals_two_ptr => & + submatrices_vals(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + end if - ! ! Search for the matching column - ! ! 
We're intersecting the global column indices of mat_sparsity_match (cols) and matrix (cols_two_ptr) - ! call intersect_pre_sorted_indices_only(cols(1:ncols), cols_two_ptr, cols_index_one, cols_index_two, match_counter) + ! Search for the matching column + ! We're intersecting the global column indices of mat_sparsity_match (cols) and matrix (cols_two_ptr) + call intersect_pre_sorted_indices_only(cols(1:ncols), cols_two_ptr, cols_index_one, cols_index_two, match_counter) - ! ! Don't need to do anything if we have no matches - ! if (match_counter == 0) then - ! ! Store that we can skip this entry - ! symbolic_ones(j_loc)%ptr => null() - ! symbolic_vals(j_loc)%ptr => null() - ! else - - ! ! These are the matching local column indices for this row of mat_sparsity_match - ! allocate(symbolic_ones(j_loc)%ptr(match_counter)) - ! symbolic_ones(j_loc)%ptr = cols_index_one(1:match_counter) - - ! ! These are the matching values of matrix - ! allocate(symbolic_vals(j_loc)%ptr(match_counter)) - ! symbolic_vals(j_loc)%ptr = vals_two_ptr(cols_index_two(1:match_counter)) - ! end if + ! Don't need to do anything if we have no matches + if (match_counter == 0) then + ! Store that we can skip this entry + symbolic_ones(j_loc)%ptr => null() + symbolic_vals(j_loc)%ptr => null() + else + + ! These are the matching local column indices for this row of mat_sparsity_match + allocate(symbolic_ones(j_loc)%ptr(match_counter)) + symbolic_ones(j_loc)%ptr = cols_index_one(1:match_counter) + + ! These are the matching values of matrix + allocate(symbolic_vals(j_loc)%ptr(match_counter)) + symbolic_vals(j_loc)%ptr = vals_two_ptr(cols_index_two(1:match_counter)) + end if - ! ! Restore local row of matrix - ! if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then - ! call MatRestoreRow(matrix, cols(j_loc), ncols_two, & - ! cols_two_ptr, vals_two_ptr, ierr) - ! else - ! deallocate(cols_two_ptr) - ! end if - ! end do + ! Restore local row of matrix + if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + call MatRestoreRow(matrix, cols(j_loc), ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + else + deallocate(cols_two_ptr) + end if + end do - ! ! Start with the values of mat_sparsity_match in it - ! vals_previous_power_temp(1:ncols) = vals(1:ncols) + ! Start with the values of mat_sparsity_match in it + vals_previous_power_temp(1:ncols) = vals(1:ncols) - ! ! Loop over any matrix powers - ! ! vals_power_temp stores the value of A^(term-1) for this row, and we update this as we go through - ! ! the term loop - ! do term = poly_sparsity_order+2, size(coefficients) + ! ! Loop over any matrix powers + ! ! vals_power_temp stores the value of A^(term-1) for this row, and we update this as we go through + ! ! the term loop + ! do term = poly_sparsity_order+2, size(coefficients) - ! ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns - ! vals_power_temp(1:ncols) = 0 + ! ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns + ! vals_power_temp(1:ncols) = 0 - ! ! Have to finish all the columns before we move onto the next coefficient - ! do j_loc = 1, ncols + ! ! Have to finish all the columns before we move onto the next coefficient + ! do j_loc = 1, ncols - ! ! If we have no matching columns cycle this row - ! if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + ! ! If we have no matching columns cycle this row + ! if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle - ! ! 
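intersect_pre_sorted_indices_only is presumably a two-pointer walk over the two sorted column lists that returns the matching positions in each; a small Python sketch of that matching step, illustrative only:

    def intersect_pre_sorted(cols_a, cols_b):
        """Positions (in each input) of the common entries of two sorted index
        arrays - the column matching used by the row-wise product."""
        ia, ib = [], []
        i = j = 0
        while i < len(cols_a) and j < len(cols_b):
            if cols_a[i] == cols_b[j]:
                ia.append(i); ib.append(j); i += 1; j += 1
            elif cols_a[i] < cols_b[j]:
                i += 1
            else:
                j += 1
        return ia, ib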
symbolic_vals(j_loc)%ptr has the matching values of A in it - ! vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) + & - ! vals_previous_power_temp(j_loc) * symbolic_vals(j_loc)%ptr + ! ! symbolic_vals(j_loc)%ptr has the matching values of A in it + ! vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) + & + ! vals_previous_power_temp(j_loc) * symbolic_vals(j_loc)%ptr - ! end do + ! end do - ! ! ~~~~~~~~~~~ - ! ! Now can add the value of coeff * A^(term-1) to our matrix - ! ! Can skip this if coeff is zero, but still need to compute A^(term-1) - ! ! for the next time through - ! ! ~~~~~~~~~~~ - ! if (ncols /= 0 .AND. coefficients(term) /= 0d0) then - ! call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & - ! coefficients(term) * vals_power_temp, ADD_VALUES, ierr) - ! end if - - ! ! This should now have the value of A^(term-1) in it - ! vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) - ! end do - - ! ! Delete our symbolic - ! do j_loc = 1, ncols - ! if (associated(symbolic_ones(j_loc)%ptr)) then - ! deallocate(symbolic_ones(j_loc)%ptr) - ! deallocate(symbolic_vals(j_loc)%ptr) - ! end if - ! end do - ! deallocate(symbolic_vals, symbolic_ones) - ! end do - - ! call MatRestoreRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr) - ! ! We very deliberately don't call restorearray here! - ! ! There is no matseqaijgetarrayread or matseqaijrestorearrayread in Fortran - ! ! Those routines don't increment the PetscObjectStateGet which tells petsc - ! ! the mat has changed. Hence above we directly access the data pointer with - ! ! a call to MatSeqAIJGetArrayF90_mine and then never write into it - ! ! If we call the restorearrayf90, that does increment the object state - ! ! even though we only read from the array - ! ! That would mean if we pass in a pc->pmat for example, just setting up a pc - ! ! would trigger petsc setting up the pc on every iteration of the pc - ! ! call MatSeqAIJRestoreArray(reuse_submatrices(1),submatrices_vals,ierr); - - ! ! ~~~~~~~~~~~ - - ! ! Do the assembly, should need zero reductions in this given we've set the - ! ! flags above - ! call MatAssemblyBegin(cmat, MAT_FINAL_ASSEMBLY, ierr) - - ! ! Delete temporaries - ! do order = 2, poly_sparsity_order - ! call MatDestroy(matrix_powers(order), ierr) - ! end do - ! if (deallocate_submatrices) then - ! deallocate(reuse_submatrices) - ! reuse_submatrices => null() - ! end if - - ! deallocate(col_indices_off_proc_array) - ! deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, cols_index_one, cols_index_two) - - ! ! Finish assembly - ! call MatAssemblyEnd(cmat, MAT_FINAL_ASSEMBLY, ierr) + ! ! ~~~~~~~~~~~ + ! ! Now can add the value of coeff * A^(term-1) to our matrix + ! ! Can skip this if coeff is zero, but still need to compute A^(term-1) + ! ! for the next time through + ! ! ~~~~~~~~~~~ + ! if (ncols /= 0 .AND. coefficients(term) /= 0d0) then + ! call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + ! coefficients(term) * vals_power_temp, ADD_VALUES, ierr) + ! end if + + ! ! This should now have the value of A^(term-1) in it + ! vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) + ! end do + + ! 
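The still-commented loop above mirrors the monomial fixed-sparsity logic: each extra power is formed row by row and any fill outside the fixed pattern is simply dropped (the Newton-basis equivalent is still to be filled in here, as the commit message says). A dense NumPy sketch of that monomial version, illustrative only:

    import numpy as np

    def constrained_powers_sum(A, coeffs, sparsity_order):
        """sum_k coeffs[k] * A^k, where powers above sparsity_order keep the
        sparsity pattern of A^sparsity_order after every multiply."""
        n = A.shape[0]
        pattern = (np.linalg.matrix_power(A, sparsity_order) != 0) | np.eye(n, dtype=bool)
        result = coeffs[0] * np.eye(n)
        Ak = np.eye(n)
        for k in range(1, len(coeffs)):
            Ak = Ak @ A
            if k > sparsity_order:
                Ak = np.where(pattern, Ak, 0.0)   # drop fill outside the fixed pattern
            result += coeffs[k] * Ak
        return result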
Delete our symbolic + do j_loc = 1, ncols + if (associated(symbolic_ones(j_loc)%ptr)) then + deallocate(symbolic_ones(j_loc)%ptr) + deallocate(symbolic_vals(j_loc)%ptr) + end if + end do + deallocate(symbolic_vals, symbolic_ones) + end do + + call MatRestoreRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr) + ! We very deliberately don't call restorearray here! + ! There is no matseqaijgetarrayread or matseqaijrestorearrayread in Fortran + ! Those routines don't increment the PetscObjectStateGet which tells petsc + ! the mat has changed. Hence above we directly access the data pointer with + ! a call to MatSeqAIJGetArrayF90_mine and then never write into it + ! If we call the restorearrayf90, that does increment the object state + ! even though we only read from the array + ! That would mean if we pass in a pc->pmat for example, just setting up a pc + ! would trigger petsc setting up the pc on every iteration of the pc + ! call MatSeqAIJRestoreArray(reuse_submatrices(1),submatrices_vals,ierr); + + ! ~~~~~~~~~~~ + + ! Do the assembly, should need zero reductions in this given we've set the + ! flags above + call MatAssemblyBegin(cmat, MAT_FINAL_ASSEMBLY, ierr) + + ! Delete temporaries + call MatDestroy(mat_sparsity_match, ierr) + if (deallocate_submatrices) then + deallocate(reuse_submatrices) + reuse_submatrices => null() + end if + + deallocate(col_indices_off_proc_array) + deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, cols_index_one, cols_index_two) + + ! Finish assembly + call MatAssemblyEnd(cmat, MAT_FINAL_ASSEMBLY, ierr) end subroutine mat_mult_powers_share_sparsity_newton_cpu @@ -1246,13 +1245,14 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! Local variables PetscInt :: global_rows, global_cols, local_rows, local_cols - integer :: comm_size, errorcode, order + integer :: comm_size, errorcode PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() logical :: reuse_triggered PetscReal :: square_sum type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 + logical :: reuse_triggered ! ~~~~~~ @@ -1314,10 +1314,11 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & return endif + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + ! ~~~~~~~~~~~~ ! If we're here then we want an assembled approximate inverse ! ~~~~~~~~~~~~ - reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) ! If we're zeroth order poly this is trivial as it's just 1/theta_1 I if (poly_order == 0) then @@ -1330,66 +1331,12 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! For poly_order 1 and poly_sparsity_order 1 this is easy else if (poly_order == 1 .AND. poly_sparsity_order == 1) then - - ! Duplicate & copy the matrix, but ensure there is a diagonal present - call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, inv_matrix) - - ! Flags to prevent reductions when assembling (there are assembles in the shift) - call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) - call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) - call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - - ! We only have two coefficients, so they are either both real or complex conjugates - ! If real - if (coefficients(1,2) == 0d0) then - - ! Have to be careful here, as we may be first order, but the second eigenvaule - ! 
might have been set to zero thanks to the rank reducing solve - ! So we just check if the second real part is zero and if it is - ! we just compute a 0th order inverse - annoyingly we can't call - ! build_gmres_polynomial_newton_inverse_0th_order as that builds a MATDIAGONAL - ! and in the tests there is a problem where we reuse the sparsity, in the first - ! solve we don't have a zero coefficient but in the second solve we do - ! So the mat type needs to remain consistent - ! This can't happen in the complex case - if (coefficients(2,1) == 0d0) then - - ! Set to zero - call MatScale(inv_matrix, 0d0, ierr) - ! Then add in the 0th order inverse - call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) - - ! Then just return - return - end if - - ! Could just compute the equivalent mononomial here to save some flops - ! but the whole point to doing the Newton form is to avoid the - ! theta_1 * theta_2 that would result - - ! result = -A_ff/theta_1 - call MatScale(inv_matrix, -1d0/(coefficients(1, 1)), ierr) - ! result = I -A_ff/theta_1 - call MatShift(inv_matrix, 1d0, ierr) - ! result = 1/theta_2 * (I -A_ff/theta_1) - call MatScale(inv_matrix, 1d0/(coefficients(2, 1)), ierr) - - ! result = 1/theta_1 + 1/theta_2 * (I -A_ff/theta_1) - ! Don't need an assemble as there is one called in this - call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) - ! Complex conjugate roots, a +- ib - else - ! a^2 + b^2 - square_sum = coefficients(1,1)**2 + coefficients(1,2)**2 - - ! Complex conjugate roots - ! result = -A_ff / (a^2 + b^2) - call MatScale(inv_matrix, -1d0/square_sum, ierr) - ! result = 2a/(a^2 + b^2) I - A_ff / (a^2 + b^2) - ! Don't need an assemble as there is one called in this - call MatShift(inv_matrix, 2d0 * coefficients(1,1)/square_sum, ierr) - end if + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, inv_matrix) + + call build_gmres_polynomial_newton_inverse_1st_1st(matrix, & + poly_order, coefficients, inv_matrix) ! Then just return return @@ -1412,148 +1359,9 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! ~~~~~~~~~~ ! We are only here if we don't constrain_sparsity ! ~~~~~~~~~~ - - ! If not re-using - ! Copy in the initial matrix - if (.NOT. reuse_triggered) then - ! Duplicate & copy the matrix, but ensure there is a diagonal present - call mat_duplicate_copy_plus_diag(matrix, .FALSE., inv_matrix) - else - ! For the powers > 1 the pattern of the original matrix will be different - ! to the resulting inverse - call MatCopy(matrix, inv_matrix, DIFFERENT_NONZERO_PATTERN, ierr) - end if - - ! Set to zero as we add in each product of terms - call MatScale(inv_matrix, 0d0, ierr) - - ! Don't set any off processor entries so no need for a reduction when assembling - call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) - - ! We start with an identity in mat_product - call generate_identity(matrix, mat_product) - - ! ~~~~~~~~~~~~ - ! Iterate over the order - ! This is basically the same as the MF application but we have to build the powers - ! ~~~~~~~~~~~~ - order = 1 - do while (order .le. size(coefficients, 1) - 1) - - ! Duplicate & copy the matrix, but ensure there is a diagonal present - ! temp_mat_A is going to store things with the sparsity of A - if (PetscObjectIsNull(temp_mat_A)) then - call mat_duplicate_copy_plus_diag(matrix, .FALSE., temp_mat_A) - else - ! 
Can reuse the sparsity - call mat_duplicate_copy_plus_diag(matrix, .TRUE., temp_mat_A) - end if - - ! If real this is easy - if (coefficients(order,2) == 0d0) then - - ! Skips eigenvalues that are numerically zero - see - ! the comment in calculate_gmres_polynomial_roots_newton - if (abs(coefficients(order,1)) < 1e-12) then - order = order + 1 - cycle - end if - - ! Then add the scaled version of each product - if (reuse_triggered) then - ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0/coefficients(order,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) - else - ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0/coefficients(order,1), mat_product) - end if - - ! temp_mat_A = A_ff/theta_k - call MatScale(temp_mat_A, -1d0/coefficients(order,1), ierr) - ! temp_mat_A = I - A_ff/theta_k - call MatShift(temp_mat_A, 1d0, ierr) - - ! mat_product_k_plus_1 = mat_product * temp_mat_A - call MatMatMult(temp_mat_A, mat_product, & - MAT_INITIAL_MATRIX, 1.5d0, mat_product_k_plus_1, ierr) - call MatDestroy(mat_product, ierr) - mat_product = mat_product_k_plus_1 - - order = order + 1 - - ! Complex - else - - ! Skips eigenvalues that are numerically zero - if (coefficients(order,1)**2 + coefficients(order,2)**2 < 1e-12) then - order = order + 2 - cycle - end if - - ! Compute 2a I - A - ! Have to use the DIFFERENT_NONZERO_PATTERN here - ! temp_mat_A = -A - call MatScale(temp_mat_A, -1d0, ierr) - ! temp_mat_A = 2a I - A_ff - call MatShift(temp_mat_A, 2d0 * coefficients(order,1), ierr) - ! temp_mat_A = (2a I - A_ff)/(a^2 + b^2) - call MatScale(temp_mat_A, 1d0/(coefficients(order,1)**2 + coefficients(order,2)**2), ierr) - - call MatMatMult(temp_mat_A, mat_product, & - MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) - - ! Then add the scaled version of each product - if (reuse_triggered) then - ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0, temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) - else - ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0, temp_mat_two) - end if - - if (order .le. size(coefficients, 1) - 2) then - ! temp_mat_three = matrix * temp_mat_two - call MatMatMult(matrix, temp_mat_two, & - MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) - call MatDestroy(temp_mat_two, ierr) - - ! Then add the scaled version of each product - if (reuse_triggered) then - ! If doing reuse we know our nonzeros are a subset - call MatAXPY(mat_product, -1d0, temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) - else - ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(mat_product, -1d0, temp_mat_three) - end if - call MatDestroy(temp_mat_three, ierr) - else - call MatDestroy(temp_mat_two, ierr) - end if - - ! Skip two evals - order = order + 2 - - end if - end do - - ! Final step if last root is real - if (coefficients(size(coefficients,1),2) == 0d0) then - ! Add in the final term multiplied by 1/theta_poly_order - - ! Skips eigenvalues that are numerically zero - if (abs(coefficients(order,1)) > 1e-12) then - if (reuse_triggered) then - ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0/coefficients(order,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) - else - ! 
Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0/coefficients(order,1), mat_product) - end if - end if - end if - - call MatDestroy(temp_mat_A, ierr) - call MatDestroy(mat_product, ierr) + call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + inv_matrix) + end subroutine build_gmres_polynomial_newton_inverse @@ -1621,7 +1429,7 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly type(tMat), intent(inout) :: inv_matrix ! Local variables - integer :: order + integer :: i PetscErrorCode :: ierr logical :: reuse_triggered type(tVec) :: inv_vec, diag_vec, product_vec, temp_vec_A, one_vec, temp_vec_two @@ -1633,7 +1441,6 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly call MatCreateVecs(matrix, product_vec, diag_vec, ierr) call MatGetDiagonal(matrix, diag_vec, ierr) - ! This stores D^order if (.NOT. reuse_triggered) then call VecDuplicate(diag_vec, inv_vec, ierr) else @@ -1649,40 +1456,40 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly call VecSet(product_vec, 1d0, ierr) call VecSet(one_vec, 1d0, ierr) - order = 1 - do while (order .le. size(coefficients, 1) - 1) + i = 1 + do while (i .le. size(coefficients, 1) - 1) ! temp_vec_A is going to store things with the sparsity of A call VecCopy(diag_vec, temp_vec_A, ierr) ! If real this is easy - if (coefficients(order,2) == 0d0) then + if (coefficients(i,2) == 0d0) then ! Skips eigenvalues that are numerically zero - see ! the comment in calculate_gmres_polynomial_roots_newton - if (abs(coefficients(order,1)) < 1e-12) then - order = order + 1 + if (abs(coefficients(i,1)) < 1e-12) then + i = i + 1 cycle end if - call VecAXPY(inv_vec, 1d0/coefficients(order,1), product_vec, ierr) + call VecAXPY(inv_vec, 1d0/coefficients(i,1), product_vec, ierr) ! temp_vec_A = A_ff/theta_k - call VecScale(temp_vec_A, -1d0/coefficients(order,1), ierr) + call VecScale(temp_vec_A, -1d0/coefficients(i,1), ierr) ! temp_vec_A = I - A_ff/theta_k call VecAXPY(temp_vec_A, 1d0, one_vec, ierr) ! product_vec = product_vec * temp_vec_A call VecPointwiseMult(product_vec, product_vec, temp_vec_A, ierr) - order = order + 1 + i = i + 1 ! Complex else ! Skips eigenvalues that are numerically zero - if (coefficients(order,1)**2 + coefficients(order,2)**2 < 1e-12) then - order = order + 2 + if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then + i = i + 2 cycle end if @@ -1690,22 +1497,22 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly ! temp_vec_A = -A call VecScale(temp_vec_A, -1d0, ierr) ! temp_vec_A = 2a I - A_ff - call VecAXPY(temp_vec_A, 2d0 * coefficients(order,1), one_vec, ierr) + call VecAXPY(temp_vec_A, 2d0 * coefficients(i,1), one_vec, ierr) ! temp_vec_A = (2a I - A_ff)/(a^2 + b^2) - call VecScale(temp_vec_A, 1d0/(coefficients(order,1)**2 + coefficients(order,2)**2), ierr) + call VecScale(temp_vec_A, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) ! temp_vec_two = temp_vec_A * product_vec call VecPointwiseMult(temp_vec_two, temp_vec_A, product_vec, ierr) call VecAXPY(inv_vec, 1d0, temp_vec_two, ierr) - if (order .le. size(coefficients, 1) - 2) then + if (i .le. size(coefficients, 1) - 2) then ! temp_vec_two = A * temp_vec_two call VecPointwiseMult(temp_vec_two, diag_vec, temp_vec_two, ierr) call VecAXPY(product_vec, -1d0, temp_vec_two, ierr) end if ! 
Skip two evals - order = order + 2 + i = i + 2 end if end do @@ -1715,8 +1522,8 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly ! Add in the final term multiplied by 1/theta_poly_order ! Skips eigenvalues that are numerically zero - if (abs(coefficients(order,1)) > 1e-12) then - call VecAXPY(inv_vec, 1d0/coefficients(order,1), product_vec, ierr) + if (abs(coefficients(i,1)) > 1e-12) then + call VecAXPY(inv_vec, 1d0/coefficients(i,1), product_vec, ierr) end if end if @@ -1737,6 +1544,374 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coefficients, & + inv_matrix, mat_product_output) + + ! Specific 1st order with 1st order sparsity + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + type(tMat), intent(inout), optional :: mat_product_output + + ! Local variables + PetscErrorCode :: ierr + logical :: reuse_triggered, output_product + PetscReal :: square_sum + + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + output_product = present(mat_product_output) + + ! Flags to prevent reductions when assembling (there are assembles in the shift) + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) + + ! We only have two coefficients, so they are either both real or complex conjugates + ! If real + if (coefficients(1,2) == 0d0) then + + ! Have to be careful here, as we may be first order, but the second eigenvaule + ! might have been set to zero thanks to the rank reducing solve + ! So we just check if the second real part is zero and if it is + ! we just compute a 0th order inverse - annoyingly we can't call + ! build_gmres_polynomial_newton_inverse_0th_order as that builds a MATDIAGONAL + ! and in the tests there is a problem where we reuse the sparsity, in the first + ! solve we don't have a zero coefficient but in the second solve we do + ! So the mat type needs to remain consistent + ! This can't happen in the complex case + if (coefficients(2,1) == 0d0) then + + ! Set to zero + call MatScale(inv_matrix, 0d0, ierr) + ! Then add in the 0th order inverse + call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) + + ! Then just return + return + end if + + ! Could just compute the equivalent mononomial here to save some flops + ! but the whole point to doing the Newton form is to avoid the + ! theta_1 * theta_2 that would result + + ! result = -A_ff/theta_1 + call MatScale(inv_matrix, -1d0/(coefficients(1, 1)), ierr) + ! result = I -A_ff/theta_1 + call MatShift(inv_matrix, 1d0, ierr) + ! If we're doing this as part of fixed sparsity multiply, + ! we need to return mat_product_output + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + end if + + ! result = 1/theta_2 * (I -A_ff/theta_1) + call MatScale(inv_matrix, 1d0/(coefficients(2, 1)), ierr) + + ! result = 1/theta_1 + 1/theta_2 * (I -A_ff/theta_1) + ! 
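For reference, the two branches of this first-order routine expand to the following closed forms (a sketch in the notation of the surrounding comments, not an extra code path). With two real roots theta_1 and theta_2:

\[ p(A) \;=\; \frac{1}{\theta_1} I \;+\; \frac{1}{\theta_2}\left(I - \frac{A}{\theta_1}\right) , \]

and with a conjugate pair theta = a + ib, \bar{\theta} = a - ib, the same degree-1 Newton form collapses to a purely real expression,

\[ \frac{1}{\theta} I + \frac{1}{\bar\theta}\left(I - \frac{A}{\theta}\right) \;=\; \frac{2a\,I - A}{a^2 + b^2} , \]

which is why the complex branch never needs complex arithmetic.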
Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) + + ! Complex conjugate roots, a +- ib + else + ! a^2 + b^2 + square_sum = coefficients(1,1)**2 + coefficients(1,2)**2 + + ! Complex conjugate roots + ! result = -A_ff / (a^2 + b^2) + call MatScale(inv_matrix, -1d0/square_sum, ierr) + ! result = 2a/(a^2 + b^2) I - A_ff / (a^2 + b^2) + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 2d0 * coefficients(1,1)/square_sum, ierr) + ! If we're doing this as part of fixed sparsity multiply, + ! we need to return mat_product_output + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + end if + end if + + end subroutine build_gmres_polynomial_newton_inverse_1st_1st + + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + inv_matrix, mat_product_output, poly_sparsity_order, output_first_complex) + + ! No constrained sparsity by default + ! If you pass in mat_product_output, poly_sparsity_order, output_first_complex + ! then it will build part of the terms, up to poly_sparsity_order, and return the product + ! in mat_product_output that you need to compute the rest of the fixed sparsity terms + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + type(tMat), intent(inout), optional :: mat_product_output + integer, intent(in), optional :: poly_sparsity_order + logical, intent(inout), optional :: output_first_complex + + ! Local variables + PetscErrorCode :: ierr + logical :: reuse_triggered, output_product, first_complex + integer :: i, i_sparse + type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 + + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + output_product = present(mat_product_output) + + if (.NOT. reuse_triggered) then + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, .FALSE., inv_matrix) + end if + + ! Set to zero as we add in each product of terms + call MatScale(inv_matrix, 0d0, ierr) + + ! Don't set any off processor entries so no need for a reduction when assembling + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + + ! We start with an identity in mat_product + call generate_identity(matrix, mat_product) + + ! If we're going to output the product as part of a fixed sparsity multiply, + ! we may be asking to constrain the sparsity to a power in between order and order + 2 + ! if there is a complex root at poly_sparsity_order + ! ie if we have roots (theta_1^r, theta_2^c, theta_3^c, theta_4^r) + ! where ^r means a purely real root and ^c means a complex root + ! want poly_sparsity_order = 1, we can't process all the way up to theta_3^c as that would + ! compute up to an A^2 term which is beyond our sparsity constraint + ! So we just check if the last root also has it's complex conjugate present + ! This will never happen in any context except when we are outputting the product + ! as part of a fixed sparsity multiply + + ! i_sparse tells us how many roots we are going to process + ! Normally this would just be size(coefficients, 1) and the loop below goes up + ! 
to size(coefficients, 1) - 1. The last real root gets its final term added outside the loop + ! and if the last root is complex then we only have to hit the first of the pair in the loop + ! + ! If we have fixed sparsity: + ! + ! if the fixed sparsity root is real then we want to set i_sparse to poly_sparsity_order+1 + ! so we hit the roots up to poly_sparsity_order in the loop and then we take care of the + ! poly_sparsity_order + 1 root outside the loop + ! + ! if the fixed sparsity root is complex but poly_sparsity_order + 1 hits the second of the pair + ! then we only need to set i_sparse to poly_sparsity_order + 1 so we only hit the first + ! pair in the loop below + ! + ! if the fixed sparsity root is complex but poly_sparsity_order + 1 hits the first of the pair + ! then we need to set i_sparse to poly_sparsity_order + 2 + ! otherwise we would never hit the first pair + + i_sparse = size(coefficients, 1) + first_complex = .FALSE. + + if (output_product) then + output_first_complex = .FALSE. + if (output_product) then + i_sparse = poly_sparsity_order + 1 + + ! If the one before is real, then we know we're on the first + if (coefficients(i_sparse-1,2) == 0d0) then + output_first_complex = .TRUE. + ! See discussion above + i_sparse = i_sparse + 1 + + ! If the one before is complex + else + + ! Check if the distance between the fixed sparsity root and the one before is > zero + ! If so they must be complex conjugates and hence we are on the second of the pair + if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .AND. & + abs(coefficients(i_sparse,2) + coefficients(i_sparse-1,2))/coefficients(i_sparse,2) > 1e-14) then + output_first_complex = .TRUE. + i_sparse = i_sparse + 1 + end if + end if + end if + first_complex = output_first_complex + end if + + ! ~~~~~~~~~~~~ + ! Iterate over the i + ! This is basically the same as the MF application but we have to build the powers + ! ~~~~~~~~~~~~ + i = 1 + ! Loop through to one fewer than the number of roots + ! We're always building up the next product + do while (i .le. i_sparse - 1) + + ! Duplicate & copy the matrix, but ensure there is a diagonal present + ! temp_mat_A is going to store things with the sparsity of A + if (PetscObjectIsNull(temp_mat_A)) then + call mat_duplicate_copy_plus_diag(matrix, .FALSE., temp_mat_A) + else + ! Can reuse the sparsity + call mat_duplicate_copy_plus_diag(matrix, .TRUE., temp_mat_A) + end if + + ! If real this is easy + if (coefficients(i,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(i,1)) < 1e-12) then + i = i + 1 + cycle + end if + + ! Then add the scaled version of each product + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can do it directly + call MatShift(inv_matrix, 1d0/coefficients(i,1), ierr) + else + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i,1), mat_product) + end if + end if + + ! temp_mat_A = A_ff/theta_k + call MatScale(temp_mat_A, -1d0/coefficients(i,1), ierr) + ! temp_mat_A = I - A_ff/theta_k + call MatShift(temp_mat_A, 1d0, ierr) + + ! mat_product_k_plus_1 = mat_product * temp_mat_A + if (i == 1) then + ! 
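In matrix form, the real-root pass above is the recurrence (a restatement of what the MatScale/MatShift/MatMatMult calls around it compute, with \(\Pi_1 = I\)):

\[ p \;\leftarrow\; p + \frac{1}{\theta_k}\,\Pi_k, \qquad \Pi_{k+1} \;=\; \left(I - \frac{A}{\theta_k}\right)\Pi_k , \]

so each real root multiplies the running product by one more factor with the sparsity of A, i.e. the product's stencil grows by one power of A per real root.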
If i == 1 then we know mat_product is identity so we can just copy + call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, mat_product, ierr) + else + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, mat_product_k_plus_1, ierr) + call MatDestroy(mat_product, ierr) + mat_product = mat_product_k_plus_1 + end if + + ! We copy out the last product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i == i_sparse - 1) then + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + end if + + i = i + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then + i = i + 2 + cycle + end if + + ! If doing the normal iteration + if (.NOT. first_complex) then + + ! temp_mat_A = -A + call MatScale(temp_mat_A, -1d0, ierr) + ! temp_mat_A = 2a I - A_ff + call MatShift(temp_mat_A, 2d0 * coefficients(i,1), ierr) + + ! temp_mat_A = (2a I - A_ff)/(a^2 + b^2) + call MatScale(temp_mat_A, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) + + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can do it directly + call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + else + ! temp_mat_two = temp_mat_A * mat_product + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) + end if + + ! If instead we only have the first of a complex conjugate pair + ! We want to pass out 2 * a * mat_product/(a^2 + b^2) and only add that to inv_matrix + ! This is equivalent to only part of tmp on Line 9 of Loe + ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) + ! as this is the part that would increase the sparsity beyond poly_sparsity_order + else + + ! Copy mat_product into temp_mat_two + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + ! temp_mat_two = 2a * mat_product/(a^2 + b^2) + call MatScale(temp_mat_two, 2d0 * coefficients(i,1)/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) + + end if + + ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + end if + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0, temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0, temp_mat_two) + end if + + if (i .le. i_sparse - 2) then + ! temp_mat_three = matrix * temp_mat_two + call MatMatMult(matrix, temp_mat_two, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) + call MatDestroy(temp_mat_two, ierr) + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(mat_product, -1d0, temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(mat_product, -1d0, temp_mat_three) + end if + call MatDestroy(temp_mat_three, ierr) + else + call MatDestroy(temp_mat_two, ierr) + end if + + ! Skip two evals + i = i + 2 + + end if + end do + + ! Final step if last root is real + if (.NOT. first_complex) then + if (coefficients(i_sparse,2) == 0d0) then + ! 
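The complex branch consumes a conjugate pair a +- ib in a single pass; combining the two complex Newton steps 1/theta and (I - A/theta)/\bar{\theta} keeps everything real. Up to where the 1/(a^2 + b^2) factor is applied, the update is (as a sketch of what the MatMatMult/MatAXPY sequence computes):

\[ T = \frac{(2a\,I - A)\,\Pi_k}{a^2 + b^2}, \qquad p \;\leftarrow\; p + T, \qquad \Pi_{k+2} \;=\; \Pi_k - A\,T \;=\; \left(I - \frac{A}{\theta}\right)\left(I - \frac{A}{\bar\theta}\right)\Pi_k , \]

so a pair advances the product by two powers of A, which is why the fixed-sparsity bookkeeping has to know whether it stops on the first or the second root of a pair.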
Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(i,1)) > 1e-12) then + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i,1), mat_product) + end if + end if + end if + end if + + call MatDestroy(temp_mat_A, ierr) + call MatDestroy(mat_product, ierr) + + end subroutine build_gmres_polynomial_newton_inverse_full + ! ------------------------------------------------------------------------------------------------------------------------------- From c163ec45ab7d822804d3419af42d498d493327ec Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 29 Jan 2026 16:29:19 +0000 Subject: [PATCH 09/41] Partially finished higher order sparsity terms. The (r,r,r) and (c,c,r) cases are working, but the (r,c,c) is not yet. --- src/Gmres_Poly_Newton.F90 | 293 +++++++++++++++++++++++++++++--------- 1 file changed, 224 insertions(+), 69 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index f47f437..e52a67d 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -819,29 +819,29 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt :: global_col_start, global_col_end_plus_one, n, ncols, ncols_two, ifree, max_nnzs PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0 integer :: errorcode, match_counter, term - integer :: comm_size + integer :: comm_size, diag_index PetscErrorCode :: ierr integer, dimension(:), allocatable :: cols_index_one, cols_index_two PetscInt, dimension(:), allocatable :: col_indices_off_proc_array, ad_indices, cols PetscReal, dimension(:), allocatable :: vals type(tIS), dimension(1) :: col_indices, row_indices - type(tMat) :: Ad, Ao + type(tMat) :: Ad, Ao, mat_sparsity_match PetscInt, dimension(:), pointer :: colmap logical :: deallocate_submatrices = .FALSE. type(c_ptr) :: vals_c_ptr - type(tMat), pointer :: mat_sparsity_match type(int_vec), dimension(:), allocatable :: symbolic_ones type(real_vec), dimension(:), allocatable :: symbolic_vals integer(c_long_long) A_array MPIU_Comm :: MPI_COMM_MATRIX - PetscReal, dimension(:), allocatable :: vals_power_temp, vals_previous_power_temp + PetscReal, dimension(:), allocatable :: vals_power_temp, vals_previous_power_temp, temp PetscInt, dimension(:), pointer :: submatrices_ia, submatrices_ja, cols_two_ptr, cols_ptr PetscReal, dimension(:), pointer :: vals_two_ptr, vals_ptr real(c_double), pointer :: submatrices_vals(:) logical :: reuse_triggered PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done PetscInt, parameter :: one = 1, zero = 0 - logical :: output_first_complex + logical :: output_first_complex, skip_add + PetscReal :: square_sum ! ~~~~~~~~~~ @@ -863,6 +863,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp reuse_triggered = .NOT. PetscObjectIsNull(cmat) + print *, "coefficients", coefficients + ! ~~~~~~~~~~ ! Compute cmat for all powers up to poly_sparsity_order ! We have to be more careful here than in the monomial case @@ -887,13 +889,26 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! as this is the part of the product with sparsity up to A ! This is because the prod for complex builds up the A^2 term for the next iteration ! 
given it does two roots at a time + + ! If we have a real first coefficient and a second complex + ! we can't call build_gmres_polynomial_newton_inverse_1st_1st as it is only correct + ! for valid coefficients up to 1st order (ie both real or both complex) + if (coefficients(1,2) == 0d0 .AND. coefficients(2,2) /= 0d0) then + + print *, "DOING FULL FIRST ORDER BUILD" + + call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex) + + else - ! Duplicate & copy the matrix, but ensure there is a diagonal present - call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat) + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat) - call build_gmres_polynomial_newton_inverse_1st_1st(matrix, one, & - coefficients(1:poly_sparsity_order + 1, 1:2), & - cmat, mat_sparsity_match) + call build_gmres_polynomial_newton_inverse_1st_1st(matrix, one, & + coefficients(1:poly_sparsity_order + 1, 1:2), & + cmat, mat_sparsity_match) + end if else ! If we're any higher, then we build cmat up to that order @@ -1046,6 +1061,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp allocate(vals(max_nnzs)) allocate(vals_power_temp(max_nnzs)) allocate(vals_previous_power_temp(max_nnzs)) + allocate(temp(max_nnzs)) allocate(cols_index_one(max_nnzs)) allocate(cols_index_two(max_nnzs)) @@ -1069,7 +1085,15 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp cols(1:ncols) = cols_ptr(1:ncols) vals(1:ncols) = vals_ptr(1:ncols) call MatRestoreRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, & - cols_ptr, vals_ptr, ierr) + cols_ptr, vals_ptr, ierr) + diag_index = -1 + ! Find the diagonal index in this row + do j_loc = 1, ncols + if (cols(j_loc) == i_loc - 1 + global_row_start) then + diag_index = j_loc + exit + end if + end do ! This is just a symbolic for the set of rows given in cols ! Let's just do all the column matching and extraction of the values once @@ -1145,39 +1169,169 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Start with the values of mat_sparsity_match in it vals_previous_power_temp(1:ncols) = vals(1:ncols) - ! ! Loop over any matrix powers - ! ! vals_power_temp stores the value of A^(term-1) for this row, and we update this as we go through - ! ! the term loop - ! do term = poly_sparsity_order+2, size(coefficients) + ! Loop over any matrix powers + ! vals_power_temp stores the prod for this row, and we update this as we go through + ! the term loop + term = poly_sparsity_order + 1 + skip_add = .FALSE. + ! If the fixed sparsity root is the second of a complex pair, we start one term earlier + ! so that we can compute the correct part of the product, we just make sure not to add + if (coefficients(term,2) /= 0d0 .AND. .NOT. output_first_complex) then + term = term - 1 + skip_add = .TRUE. + print *, "minus one starting term for complex root" + end if + + print *, "starting with term", term + ! This loop skips the last coefficient + do while (term .le. size(coefficients, 1) - 1) + + ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns + vals_power_temp(1:ncols) = 0 + + print *, "coeff in term", term, coefficients(term, 1), coefficients(term, 2) + + ! If real + if (coefficients(term,2) == 0d0) then + + print *, "inside real term", term + + ! ~~~~~~~~~~~ + ! 
Now can add the value to our matrix + ! Can skip this if coeff is zero, but still need to compute A^(term-1) + ! for the next time through + ! Also we skip the first one if we're real as that value has already been added to the + ! matrix by the build_gmres_polynomial_newton_inverse_full (as we had to build the product up + ! to that order) + ! ~~~~~~~~~~~ + if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12 .AND. term > poly_sparsity_order + 1) then + print *, "adding to matrix real term", term + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) + end if + + ! Initialize with previous product before the A*prod subtraction + vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) + + ! Have to finish all the columns before we move onto the next coefficient + do j_loc = 1, ncols - ! ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns - ! vals_power_temp(1:ncols) = 0 + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle - ! ! Have to finish all the columns before we move onto the next coefficient - ! do j_loc = 1, ncols + print *, "processing column ", j_loc, " for real term ", term, "with coeff", coefficients(term, 1) - ! ! If we have no matching columns cycle this row - ! if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + ! This is the (I - A_ff/theta_k) * prod + vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - & + 1d0/coefficients(term, 1) * & + symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) + end do + + term = term + 1 + + ! If complex + else - ! ! symbolic_vals(j_loc)%ptr has the matching values of A in it - ! vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) + & - ! vals_previous_power_temp(j_loc) * symbolic_vals(j_loc)%ptr + square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) + if (.NOT. skip_add) then + + print *, "NOT SKIP ADD", term, "with output_first_complex", output_first_complex + + ! We skip the 2 * a * prod from the first root of a complex pair if that has already + ! been included in the inv_matrix from build_gmres_polynomial_newton_inverse_full + if (term < poly_sparsity_order + 2) then + if (.NOT. output_first_complex) then + temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + print *, "not skipping first complex part of product" + else + temp(1:ncols) = 0d0 + print *, "skipping first complex part of product" + end if + else + temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + print *, "adding 2a term as normal" + end if + + ! This is the -A * prod + do j_loc = 1, ncols + + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + temp(symbolic_ones(j_loc)%ptr) = temp(symbolic_ones(j_loc)%ptr) - & + symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) + end do + + ! This is the p = p + 1/(a^2 + b^2) * temp + if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + square_sum * temp(1:ncols), ADD_VALUES, ierr) + end if + + ! for (r, c, c) + ! problem here is 2 *a * prod has been added to inv_matrix but we need to have added + ! 2aprod/a^2+b^2 + ! 
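Written per row, the kernel above is a fixed-pattern product: row i only ever keeps the column set S_i it started with (the sparsity of the already-assembled cmat), and any fill a true product would generate is simply dropped. Schematically, for a real root theta_k (the conjugate-pair branch is analogous, using the 2a and -A pieces of its temp term):

\[ p_{i,c} \;\leftarrow\; p_{i,c} + \frac{1}{\theta_k}\,\pi_{i,c}, \qquad \pi_{i,c} \;\leftarrow\; \pi_{i,c} - \frac{1}{\theta_k}\sum_{j \in S_i} \pi_{i,j}\,A_{j,c} \quad \text{for } c \in S_i , \]

with contributions to columns outside S_i discarded; the symbolic_ones/symbolic_vals arrays are just the precomputed index matching for that restricted sum.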
for (c, c, r) mat product is output without the 1/a^2+b^2 but that is fine as we + ! compensate for that in the product - ! end do - ! ! ~~~~~~~~~~~ - ! ! Now can add the value of coeff * A^(term-1) to our matrix - ! ! Can skip this if coeff is zero, but still need to compute A^(term-1) - ! ! for the next time through - ! ! ~~~~~~~~~~~ - ! if (ncols /= 0 .AND. coefficients(term) /= 0d0) then - ! call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & - ! coefficients(term) * vals_power_temp, ADD_VALUES, ierr) - ! end if - - ! ! This should now have the value of A^(term-1) in it - ! vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) - ! end do + ! First time through complex pair + else + + ! If we're skipping the add, then vals_previous_power_temp has all the correct + ! values in it for temp + ! All we have to do is compute prod for the next time through + skip_add = .FALSE. + print *, "SKIP ADD" + temp(1:ncols) = vals_previous_power_temp(1:ncols) + ! @@@ have to be careful here! + ! If we've gone back a term, we don't have anything in prod + ! prod is I when term = 1 + if (term == 1) then + vals_previous_power_temp(1:ncols) = 0d0 + if (diag_index /= -1) then + vals_previous_power_temp(diag_index) = 1d0 + end if + end if + end if + + if (term .le. size(coefficients, 1)- 2) then + + print *, "COMPUTING PRODUCT COMPLEX" + + vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) + + ! This is prod = prod - 1/(a^2 + b^2) * A * temp + do j_loc = 1, ncols + + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - & + square_sum * & + symbolic_vals(j_loc)%ptr * temp(j_loc) + end do + end if + + term = term + 2 + + end if + + ! This should now have the value of A^(term-1) in it + vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) + end do + + ! Final step if last root is real + if (coefficients(term,2) == 0d0) then + if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then + print *, "adding to matrix FINAL real term", term, coefficients(term, 1) + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) + end if + end if ! Delete our symbolic do j_loc = 1, ncols @@ -1215,7 +1369,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp end if deallocate(col_indices_off_proc_array) - deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, cols_index_one, cols_index_two) + deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, temp, cols_index_one, cols_index_two) ! Finish assembly call MatAssemblyEnd(cmat, MAT_FINAL_ASSEMBLY, ierr) @@ -1547,7 +1701,7 @@ end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coefficients, & - inv_matrix, mat_product_output) + inv_matrix, mat_prod_or_temp) ! 
Specific 1st order with 1st order sparsity @@ -1556,7 +1710,7 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe integer, intent(in) :: poly_order PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients type(tMat), intent(inout) :: inv_matrix - type(tMat), intent(inout), optional :: mat_product_output + type(tMat), intent(inout), optional :: mat_prod_or_temp ! Local variables PetscErrorCode :: ierr @@ -1566,13 +1720,15 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! ~~~~~~ reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) - output_product = present(mat_product_output) + output_product = present(mat_prod_or_temp) ! Flags to prevent reductions when assembling (there are assembles in the shift) call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) + print *, "inside 1st 1st", coefficients + ! We only have two coefficients, so they are either both real or complex conjugates ! If real if (coefficients(1,2) == 0d0) then @@ -1606,9 +1762,9 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! result = I -A_ff/theta_1 call MatShift(inv_matrix, 1d0, ierr) ! If we're doing this as part of fixed sparsity multiply, - ! we need to return mat_product_output + ! we need to return mat_prod_or_temp if (output_product) then - call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if ! result = 1/theta_2 * (I -A_ff/theta_1) @@ -1624,16 +1780,18 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe square_sum = coefficients(1,1)**2 + coefficients(1,2)**2 ! Complex conjugate roots - ! result = -A_ff / (a^2 + b^2) - call MatScale(inv_matrix, -1d0/square_sum, ierr) - ! result = 2a/(a^2 + b^2) I - A_ff / (a^2 + b^2) + ! result = -A_ff + call MatScale(inv_matrix, -1d0, ierr) + ! result = 2a I - A_ff ! Don't need an assemble as there is one called in this - call MatShift(inv_matrix, 2d0 * coefficients(1,1)/square_sum, ierr) + call MatShift(inv_matrix, 2d0 * coefficients(1,1), ierr) ! If we're doing this as part of fixed sparsity multiply, - ! we need to return mat_product_output + ! we need to return mat_prod_or_temp if (output_product) then - call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) - end if + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + ! result = 2a I - A_ff/(a^2 + b^2) + call MatScale(inv_matrix, 1d0/square_sum, ierr) end if end subroutine build_gmres_polynomial_newton_inverse_1st_1st @@ -1642,19 +1800,19 @@ end subroutine build_gmres_polynomial_newton_inverse_1st_1st ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & - inv_matrix, mat_product_output, poly_sparsity_order, output_first_complex) + inv_matrix, mat_prod_or_temp, poly_sparsity_order, output_first_complex) ! No constrained sparsity by default - ! If you pass in mat_product_output, poly_sparsity_order, output_first_complex + ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex ! 
then it will build part of the terms, up to poly_sparsity_order, and return the product - ! in mat_product_output that you need to compute the rest of the fixed sparsity terms + ! in mat_prod_or_temp that you need to compute the rest of the fixed sparsity terms ! ~~~~~~ type(tMat), intent(in) :: matrix integer, intent(in) :: poly_order PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients type(tMat), intent(inout) :: inv_matrix - type(tMat), intent(inout), optional :: mat_product_output + type(tMat), intent(inout), optional :: mat_prod_or_temp integer, intent(in), optional :: poly_sparsity_order logical, intent(inout), optional :: output_first_complex @@ -1667,7 +1825,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! ~~~~~~ reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) - output_product = present(mat_product_output) + output_product = present(mat_prod_or_temp) if (.NOT. reuse_triggered) then ! Duplicate & copy the matrix, but ensure there is a diagonal present @@ -1802,7 +1960,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then - call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if i = i + 1 @@ -1824,9 +1982,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! temp_mat_A = 2a I - A_ff call MatShift(temp_mat_A, 2d0 * coefficients(i,1), ierr) - ! temp_mat_A = (2a I - A_ff)/(a^2 + b^2) - call MatScale(temp_mat_A, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) - if (i == 1) then ! If i == 1 then we know mat_product is identity so we can do it directly call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) @@ -1837,31 +1992,30 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi end if ! If instead we only have the first of a complex conjugate pair - ! We want to pass out 2 * a * mat_product/(a^2 + b^2) and only add that to inv_matrix + ! We want to pass out mat_product and only add that to inv_matrix ! This is equivalent to only part of tmp on Line 9 of Loe ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) ! as this is the part that would increase the sparsity beyond poly_sparsity_order else ! Copy mat_product into temp_mat_two - call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) - ! temp_mat_two = 2a * mat_product/(a^2 + b^2) - call MatScale(temp_mat_two, 2d0 * coefficients(i,1)/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) end if ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then - call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_product_output, ierr) + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if ! Then add the scaled version of each product if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0, temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) + call MatAXPY(inv_matrix, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), & + temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) else ! 
Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0, temp_mat_two) + call MatAXPYWrapper(inv_matrix, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), temp_mat_two) end if if (i .le. i_sparse - 2) then @@ -1873,10 +2027,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Then add the scaled version of each product if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset - call MatAXPY(mat_product, -1d0, temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) + call MatAXPY(mat_product, -1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), & + temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) else ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(mat_product, -1d0, temp_mat_three) + call MatAXPYWrapper(mat_product, -1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), temp_mat_three) end if call MatDestroy(temp_mat_three, ierr) else From fcbed5451c821e2a3e90867713bfbc36187c320f Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 29 Jan 2026 22:03:31 +0000 Subject: [PATCH 10/41] High order terms with first order sparsity are now correct --- src/Gmres_Poly_Newton.F90 | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index e52a67d..7f604a6 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -1275,8 +1275,13 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! 2aprod/a^2+b^2 ! for (c, c, r) mat product is output without the 1/a^2+b^2 but that is fine as we ! compensate for that in the product + if (term < poly_sparsity_order + 2) then + if (output_first_complex) then + temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + print *, "ADDING 2*a*prod back into temp" + end if + end if - ! First time through complex pair else @@ -1968,6 +1973,9 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Complex else + + print *, "INTO FULL", "first_complex", first_complex + ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then i = i + 2 @@ -1977,6 +1985,8 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! If doing the normal iteration if (.NOT. first_complex) then + print *, "adding in 2a prod - A prod" + ! temp_mat_A = -A call MatScale(temp_mat_A, -1d0, ierr) ! temp_mat_A = 2a I - A_ff @@ -1989,6 +1999,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! temp_mat_two = temp_mat_A * mat_product call MatMatMult(temp_mat_A, mat_product, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) + end if + + ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if ! If instead we only have the first of a complex conjugate pair @@ -1998,14 +2013,19 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! as this is the part that would increase the sparsity beyond poly_sparsity_order else + print *, "only first complex - passing out product" + ! Copy mat_product into temp_mat_two call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) - end if + ! 
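The rearrangement here is purely algebraic, since

\[ \left[\frac{2a\,I - A}{a^2 + b^2}\right]\Pi \;=\; \frac{1}{a^2 + b^2}\Big[(2a\,I - A)\,\Pi\Big] ; \]

deferring the 1/(a^2 + b^2) factor to the MatAXPY calls means the product handed out to the fixed-sparsity path carries no hidden scaling, which the row-wise loop then applies itself (this is a reading of the surrounding comments rather than an independent change).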
temp_mat_two = 2a * mat_product + call MatScale(temp_mat_two, 2d0 * coefficients(i,1), ierr) + + ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if - ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply - if (output_product .AND. i > i_sparse - 2) then - call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if ! Then add the scaled version of each product @@ -2019,6 +2039,9 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi end if if (i .le. i_sparse - 2) then + + print *, "COMPUTING PRODUCT IN FULL" + ! temp_mat_three = matrix * temp_mat_two call MatMatMult(matrix, temp_mat_two, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) From f370c9d4f749489688403067bd9dbd981b552a39 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 29 Jan 2026 23:49:28 +0000 Subject: [PATCH 11/41] Add in fixed sparsity order 1 assembled newton tests. The tests hit the three cases with real and complex eigenvalues, namely (r,r,r), (c,c,r) and (r,c,c) cases. --- tests/Makefile | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index 514ae5e..888cecf 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -121,7 +121,12 @@ run_tests_load_serial: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals" ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 - +# + @echo "" + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order" + ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order" + ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 # ~~~~~~~~~~~ # ~~~~~~~~~~~ run_tests_load_parallel: @@ -156,6 +161,12 @@ run_tests_load_parallel: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals in parallel" $(MPIEXEC) -n 2 ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 +# + @echo "" + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -278,8 +289,8 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 - @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" - ./ex6f -m 10 -n 10 
-pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 +# @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" +# ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 # @echo "" @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change with 0th order fixed sparsity" @@ -534,9 +545,9 @@ run_tests_no_load_parallel: @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ -pc_air_a_drop 1e-3 -pc_air_inverse_type newton - @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" - $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ - -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" +# $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ +# -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change in parallel with 0th order fixed sparsity" From ec58033128f59fed32125d304f5dfc1e2337d90d Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 30 Jan 2026 01:21:36 +0000 Subject: [PATCH 12/41] Add test that checks the difference between residuals for different types of GMRES polynomial --- Makefile | 2 +- src/Gmres_Poly_Newton.F90 | 37 -------- tests/Makefile | 25 ++++++ tests/ex12f_gmres_poly.F90 | 173 +++++++++++++++++++++++++++++++++++++ 4 files changed, 199 insertions(+), 38 deletions(-) create mode 100644 tests/ex12f_gmres_poly.F90 diff --git a/Makefile b/Makefile index 3fcd41a..87fe1a6 100644 --- a/Makefile +++ b/Makefile @@ -138,7 +138,7 @@ OBJS := $(OBJS) $(SRCDIR)/PETSc_Helper.o \ $(SRCDIR)/PCPFLAREINV.o # Define a variable containing all the tests -export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset +export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset ex12f_gmres_poly # Include kokkos examples ifeq ($(PETSC_HAVE_KOKKOS),1) export TEST_TARGETS := $(TEST_TARGETS) adv_1dk diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 7f604a6..5af813c 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -863,8 +863,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp reuse_triggered = .NOT. PetscObjectIsNull(cmat) - print *, "coefficients", coefficients - ! ~~~~~~~~~~ ! Compute cmat for all powers up to poly_sparsity_order ! We have to be more careful here than in the monomial case @@ -895,8 +893,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! for valid coefficients up to 1st order (ie both real or both complex) if (coefficients(1,2) == 0d0 .AND. 
coefficients(2,2) /= 0d0) then - print *, "DOING FULL FIRST ORDER BUILD" - call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex) @@ -1179,23 +1175,17 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp if (coefficients(term,2) /= 0d0 .AND. .NOT. output_first_complex) then term = term - 1 skip_add = .TRUE. - print *, "minus one starting term for complex root" end if - print *, "starting with term", term ! This loop skips the last coefficient do while (term .le. size(coefficients, 1) - 1) ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns vals_power_temp(1:ncols) = 0 - print *, "coeff in term", term, coefficients(term, 1), coefficients(term, 2) - ! If real if (coefficients(term,2) == 0d0) then - print *, "inside real term", term - ! ~~~~~~~~~~~ ! Now can add the value to our matrix ! Can skip this if coeff is zero, but still need to compute A^(term-1) @@ -1205,7 +1195,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! to that order) ! ~~~~~~~~~~~ if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12 .AND. term > poly_sparsity_order + 1) then - print *, "adding to matrix real term", term call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) end if @@ -1219,8 +1208,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If we have no matching columns cycle this row if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle - print *, "processing column ", j_loc, " for real term ", term, "with coeff", coefficients(term, 1) - ! symbolic_vals(j_loc)%ptr has the matching values of A in it ! This is the (I - A_ff/theta_k) * prod vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - & @@ -1236,21 +1223,16 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) if (.NOT. skip_add) then - print *, "NOT SKIP ADD", term, "with output_first_complex", output_first_complex - ! We skip the 2 * a * prod from the first root of a complex pair if that has already ! been included in the inv_matrix from build_gmres_polynomial_newton_inverse_full if (term < poly_sparsity_order + 2) then if (.NOT. output_first_complex) then temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) - print *, "not skipping first complex part of product" else temp(1:ncols) = 0d0 - print *, "skipping first complex part of product" end if else temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) - print *, "adding 2a term as normal" end if ! This is the -A * prod @@ -1278,7 +1260,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp if (term < poly_sparsity_order + 2) then if (output_first_complex) then temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) - print *, "ADDING 2*a*prod back into temp" end if end if @@ -1289,7 +1270,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! values in it for temp ! All we have to do is compute prod for the next time through skip_add = .FALSE. - print *, "SKIP ADD" temp(1:ncols) = vals_previous_power_temp(1:ncols) ! @@@ have to be careful here! ! 
If we've gone back a term, we don't have anything in prod @@ -1304,8 +1284,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp if (term .le. size(coefficients, 1)- 2) then - print *, "COMPUTING PRODUCT COMPLEX" - vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) ! This is prod = prod - 1/(a^2 + b^2) * A * temp @@ -1332,7 +1310,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Final step if last root is real if (coefficients(term,2) == 0d0) then if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then - print *, "adding to matrix FINAL real term", term, coefficients(term, 1) call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) end if @@ -1408,9 +1385,6 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() - logical :: reuse_triggered - PetscReal :: square_sum - type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 logical :: reuse_triggered ! ~~~~~~ @@ -1732,8 +1706,6 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - print *, "inside 1st 1st", coefficients - ! We only have two coefficients, so they are either both real or complex conjugates ! If real if (coefficients(1,2) == 0d0) then @@ -1973,9 +1945,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Complex else - - print *, "INTO FULL", "first_complex", first_complex - ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then i = i + 2 @@ -1985,8 +1954,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! If doing the normal iteration if (.NOT. first_complex) then - print *, "adding in 2a prod - A prod" - ! temp_mat_A = -A call MatScale(temp_mat_A, -1d0, ierr) ! temp_mat_A = 2a I - A_ff @@ -2013,8 +1980,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! as this is the part that would increase the sparsity beyond poly_sparsity_order else - print *, "only first complex - passing out product" - ! Copy mat_product into temp_mat_two call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) @@ -2040,8 +2005,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi if (i .le. i_sparse - 2) then - print *, "COMPUTING PRODUCT IN FULL" - ! 
temp_mat_three = matrix * temp_mat_two call MatMatMult(matrix, temp_mat_two, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) diff --git a/tests/Makefile b/tests/Makefile index 888cecf..5a5081f 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -127,6 +127,19 @@ run_tests_load_serial: ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order" ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 +# + @echo "" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders" + @for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders first order fixed sparsity" + @for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order; \ + done + # ~~~~~~~~~~~ # ~~~~~~~~~~~ run_tests_load_parallel: @@ -167,6 +180,18 @@ run_tests_load_parallel: $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order in parallel" $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 +# + @echo "" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders in parallel" + @for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders first order fixed sparsity in parallel" + @for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order; \ + done # ~~~~~~~~~~~ # ~~~~~~~~~~~ diff --git a/tests/ex12f_gmres_poly.F90 b/tests/ex12f_gmres_poly.F90 new file mode 100644 index 0000000..bdbf8ce --- /dev/null +++ b/tests/ex12f_gmres_poly.F90 @@ -0,0 +1,173 @@ +! + program main +#include + use petscksp +#include "finclude/pflare.h" + implicit none + +! Comparison between different forms of GMRES polynomials + + PetscErrorCode ierr + PetscInt m,n,mlocal,nlocal + PetscBool flg + PetscReal norm_power, norm_rhs, norm_arnoldi, norm_newton + PetscReal :: norm_diff_one, norm_diff_two + Vec x,b,u, b_diff_type + Mat A, A_diff_type + character*(128) f + PetscViewer fd + KSP ksp + PC pc + KSPConvergedReason reason + PetscInt, parameter :: one=1 + MatType :: mtype, mtype_input + + call PetscInitialize(PETSC_NULL_CHARACTER,ierr) + if (ierr .ne. 0) then + print*,'Unable to initialize PETSc' + stop + endif + +! Read in matrix and RHS + call PetscOptionsGetString(PETSC_NULL_OPTIONS, & + & PETSC_NULL_CHARACTER,'-f',f,flg,ierr) + call PetscViewerBinaryOpen(PETSC_COMM_WORLD,f,FILE_MODE_READ, & + & fd,ierr) + + call MatCreate(PETSC_COMM_WORLD,A,ierr) + call MatLoad(A,fd,ierr) + + ! 
Get information about matrix + call MatGetSize(A,m,n,ierr) + call MatGetLocalSize(A,mlocal,nlocal,ierr) + + call VecCreate(PETSC_COMM_WORLD,b,ierr) + call VecLoad(b,fd,ierr) + call PetscViewerDestroy(fd,ierr) + + ! Test and see if the user wants us to use a different matrix type + ! with -mat_type on the command line + ! This lets us easily test our cpu and kokkos versions through our CI + call MatCreateFromOptions(PETSC_COMM_WORLD,PETSC_NULL_CHARACTER,& + one,mlocal,nlocal,m,n,A_diff_type,ierr) + call MatAssemblyBegin(A_diff_type,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A_diff_type,MAT_FINAL_ASSEMBLY,ierr) + + call MatGetType(A, mtype, ierr) + call MatGetType(A_diff_type, mtype_input, ierr) + + if (mtype /= mtype_input) then + ! Doesn't seem like there is a converter to kokkos + ! So instead we just copy into the empty A_diff_type + ! This will be slow as its not preallocated, but this is just for testing + call MatCopy(A, A_diff_type, DIFFERENT_NONZERO_PATTERN, ierr) + call MatDestroy(A,ierr) + A = A_diff_type + + ! Mat and vec types have to match + call VecCreateFromOptions(PETSC_COMM_WORLD,PETSC_NULL_CHARACTER, & + one,nlocal,n,b_diff_type,ierr) + call VecCopy(b,b_diff_type,ierr) + call VecDestroy(b,ierr) + b = b_diff_type + + else + call MatDestroy(A_diff_type,ierr) + end if + + ! Set up solution + call VecDuplicate(b,x,ierr) + call VecDuplicate(b,u,ierr) + + ! Register the pflare types + call PCRegister_PFLARE() + + call VecNorm(b,NORM_2,norm_rhs,ierr) + + ! ~~~~~~~~~~~~~ + ! Do a solve with the power basis + ! ~~~~~~~~~~~~~ + call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) + call KSPSetOperators(ksp,A,A,ierr) + call KSPGetPC(ksp, pc, ierr) + call PCSetType(pc, PCAIR, ierr) + call PCAIRSetInverseType(pc, PFLAREINV_POWER, ierr) + call KSPSetPC(ksp, pc, ierr) + call KSPSetFromOptions(ksp,ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_power,ierr) + norm_power = norm_power/norm_rhs + + ! ~~~~~~~~~~~~~ + ! Now do a solve with the Arnoldi basis + ! ~~~~~~~~~~~~~ + call PCAIRSetInverseType(pc, PFLAREINV_ARNOLDI, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_arnoldi,ierr) + norm_arnoldi = norm_arnoldi/norm_rhs + + ! ~~~~~~~~~~~~~ + ! Now do a solve with the Newton basis + ! ~~~~~~~~~~~~~ + call PCAIRSetInverseType(pc, PFLAREINV_NEWTON, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_newton,ierr) + norm_newton = norm_newton/norm_rhs + call KSPDestroy(ksp,ierr) + + ! ~~~~~~~~~~~~~ + ! Now check all the residuals are the same + ! For low order polynomials on the diagonally dominant + ! A_ff on each level they should be basically identical and hence + ! we should have almost no difference in the resulting residual + ! 
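The check below leans on the fact that, for a given order, the power, Arnoldi and Newton forms are different bases for the same GMRES polynomial, namely the q that minimizes

\[ \big\| \, b - A\,q(A)\,b \, \big\|_2 \]

over polynomials of that order. In exact arithmetic the preconditioners they assemble therefore coincide, and only rounding error, through the conditioning of each basis, can separate the resulting residuals, hence the tight relative tolerance used in the comparisons that follow.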
~~~~~~~~~~~~~ + norm_diff_one = abs(norm_power - norm_newton)/norm_newton + if (norm_diff_one > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_one + print *, "Power basis residual: ", norm_power + print *, "Newton basis residual: ", norm_newton + error stop 1 + end if + norm_diff_two = abs(norm_arnoldi - norm_power)/norm_power + if (norm_diff_two > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_two + print *, "Arnoldi basis residual: ", norm_arnoldi + print *, "Newton basis residual: ", norm_newton + error stop 1 + end if + + call VecDestroy(b,ierr) + call VecDestroy(x,ierr) + call VecDestroy(u,ierr) + call MatDestroy(A,ierr) + + call PetscFinalize(ierr) + + end \ No newline at end of file From 6c0f56bd58d2a2eeca56759bccdf234b143d9f3c Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 5 Feb 2026 14:47:02 +0000 Subject: [PATCH 13/41] Higher order terms are working for newton assembly. Still more testing required --- src/Gmres_Poly_Newton.F90 | 251 +++++++++++++++++++++----------------- 1 file changed, 142 insertions(+), 109 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 5af813c..2876053 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -684,18 +684,18 @@ subroutine mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsi type(tMat), intent(inout) :: reuse_mat, cmat type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices -#if defined(PETSC_HAVE_KOKKOS) - integer(c_long_long) :: A_array, B_array, reuse_array - integer :: errorcode, reuse_int_cmat, reuse_int_reuse_mat - PetscErrorCode :: ierr - MatType :: mat_type - Mat :: temp_mat, temp_mat_reuse, temp_mat_compare - PetscScalar normy; - logical :: reuse_triggered_cmat, reuse_triggered_reuse_mat - type(c_ptr) :: coefficients_ptr - type(tMat) :: reuse_mat_cpu - type(tMat), dimension(:), pointer :: reuse_submatrices_cpu -#endif +! #if defined(PETSC_HAVE_KOKKOS) +! integer(c_long_long) :: A_array, B_array, reuse_array +! integer :: errorcode, reuse_int_cmat, reuse_int_reuse_mat +! PetscErrorCode :: ierr +! MatType :: mat_type +! Mat :: temp_mat, temp_mat_reuse, temp_mat_compare +! PetscScalar normy; +! logical :: reuse_triggered_cmat, reuse_triggered_reuse_mat +! type(c_ptr) :: coefficients_ptr +! type(tMat) :: reuse_mat_cpu +! type(tMat), dimension(:), pointer :: reuse_submatrices_cpu +! #endif ! ~~~~~~~~~~ ! ~~~~~~~~~~ @@ -710,90 +710,90 @@ subroutine mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsi return end if -#if defined(PETSC_HAVE_KOKKOS) - - call MatGetType(matrix, mat_type, ierr) - if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & - mat_type == MATAIJKOKKOS) then - - A_array = matrix%v - reuse_triggered_cmat = .NOT. PetscObjectIsNull(cmat) - reuse_triggered_reuse_mat = .NOT. PetscObjectIsNull(reuse_mat) - reuse_int_cmat = 0 - if (reuse_triggered_cmat) then - reuse_int_cmat = 1 - B_array = cmat%v - end if - reuse_int_reuse_mat = 0 - if (reuse_triggered_reuse_mat) then - reuse_int_reuse_mat = 1 - end if - reuse_array = reuse_mat%v - coefficients_ptr = c_loc(coefficients) - - ! call mat_mult_powers_share_sparsity_newton_kokkos(A_array, poly_order, poly_sparsity_order, & - ! coefficients_ptr, reuse_int_reuse_mat, reuse_array, reuse_int_cmat, B_array) +! #if defined(PETSC_HAVE_KOKKOS) + +! call MatGetType(matrix, mat_type, ierr) +! if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & +! mat_type == MATAIJKOKKOS) then + +! 
A_array = matrix%v +! reuse_triggered_cmat = .NOT. PetscObjectIsNull(cmat) +! reuse_triggered_reuse_mat = .NOT. PetscObjectIsNull(reuse_mat) +! reuse_int_cmat = 0 +! if (reuse_triggered_cmat) then +! reuse_int_cmat = 1 +! B_array = cmat%v +! end if +! reuse_int_reuse_mat = 0 +! if (reuse_triggered_reuse_mat) then +! reuse_int_reuse_mat = 1 +! end if +! reuse_array = reuse_mat%v +! coefficients_ptr = c_loc(coefficients) + +! ! call mat_mult_powers_share_sparsity_newton_kokkos(A_array, poly_order, poly_sparsity_order, & +! ! coefficients_ptr, reuse_int_reuse_mat, reuse_array, reuse_int_cmat, B_array) - reuse_mat%v = reuse_array - cmat%v = B_array - - ! If debugging do a comparison between CPU and Kokkos results - if (kokkos_debug()) then - - ! If we're doing reuse and debug, then we have to always output the result - ! from the cpu version, as it will have coo preallocation structures set - ! They aren't copied over if you do a matcopy (or matconvert) - ! If we didn't do that the next time we come through this routine - ! and try to call the cpu version with reuse, it will segfault - if (reuse_triggered_cmat) then - temp_mat = cmat - call MatConvert(cmat, MATSAME, MAT_INITIAL_MATRIX, temp_mat_compare, ierr) - else - temp_mat_compare = cmat - end if - - ! Debug check if the CPU and Kokkos versions are the same - ! We send in an empty reuse_mat_cpu here always, as we can't pass through - ! the same one Kokkos uses as it now only gets out the non-local rows we need - ! (ie reuse_mat and reuse_mat_cpu are no longer the same size) - reuse_submatrices_cpu => null() - call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & - coefficients, reuse_mat_cpu, reuse_submatrices_cpu, temp_mat) - call destroy_matrix_reuse(reuse_mat_cpu, reuse_submatrices_cpu) +! reuse_mat%v = reuse_array +! cmat%v = B_array + +! ! If debugging do a comparison between CPU and Kokkos results +! if (kokkos_debug()) then + +! ! If we're doing reuse and debug, then we have to always output the result +! ! from the cpu version, as it will have coo preallocation structures set +! ! They aren't copied over if you do a matcopy (or matconvert) +! ! If we didn't do that the next time we come through this routine +! ! and try to call the cpu version with reuse, it will segfault +! if (reuse_triggered_cmat) then +! temp_mat = cmat +! call MatConvert(cmat, MATSAME, MAT_INITIAL_MATRIX, temp_mat_compare, ierr) +! else +! temp_mat_compare = cmat +! end if + +! ! Debug check if the CPU and Kokkos versions are the same +! ! We send in an empty reuse_mat_cpu here always, as we can't pass through +! ! the same one Kokkos uses as it now only gets out the non-local rows we need +! ! (ie reuse_mat and reuse_mat_cpu are no longer the same size) +! reuse_submatrices_cpu => null() +! call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & +! coefficients, reuse_mat_cpu, reuse_submatrices_cpu, temp_mat) +! call destroy_matrix_reuse(reuse_mat_cpu, reuse_submatrices_cpu) - call MatConvert(temp_mat, MATSAME, MAT_INITIAL_MATRIX, & - temp_mat_reuse, ierr) - - call MatAXPYWrapper(temp_mat_reuse, -1d0, temp_mat_compare) - call MatNorm(temp_mat_reuse, NORM_FROBENIUS, normy, ierr) - ! There is floating point compute in these inverses, so we have to be a - ! bit more tolerant to rounding differences - if (normy .gt. 1d-11 .OR. 
normy/=normy) then - !call MatFilter(temp_mat_reuse, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) - !call MatView(temp_mat_reuse, PETSC_VIEWER_STDOUT_WORLD, ierr) - print *, "Kokkos and CPU versions of mat_mult_powers_share_sparsity do not match" - - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - call MatDestroy(temp_mat_reuse, ierr) - if (.NOT. reuse_triggered_cmat) then - call MatDestroy(cmat, ierr) - else - call MatDestroy(temp_mat_compare, ierr) - end if - cmat = temp_mat - end if - - else - - call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & - coefficients, reuse_mat, reuse_submatrices, cmat) - - end if -#else +! call MatConvert(temp_mat, MATSAME, MAT_INITIAL_MATRIX, & +! temp_mat_reuse, ierr) + +! call MatAXPYWrapper(temp_mat_reuse, -1d0, temp_mat_compare) +! call MatNorm(temp_mat_reuse, NORM_FROBENIUS, normy, ierr) +! ! There is floating point compute in these inverses, so we have to be a +! ! bit more tolerant to rounding differences +! if (normy .gt. 1d-11 .OR. normy/=normy) then +! !call MatFilter(temp_mat_reuse, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) +! !call MatView(temp_mat_reuse, PETSC_VIEWER_STDOUT_WORLD, ierr) +! print *, "Kokkos and CPU versions of mat_mult_powers_share_sparsity do not match" + +! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) +! end if +! call MatDestroy(temp_mat_reuse, ierr) +! if (.NOT. reuse_triggered_cmat) then +! call MatDestroy(cmat, ierr) +! else +! call MatDestroy(temp_mat_compare, ierr) +! end if +! cmat = temp_mat +! end if + +! else + +! call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & +! coefficients, reuse_mat, reuse_submatrices, cmat) + +! end if +! #else call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & coefficients, reuse_mat, reuse_submatrices, cmat) -#endif +!#endif ! ~~~~~~~~~~ @@ -907,6 +907,9 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp end if else + print *,"reals", coefficients(:,1) + print *,"imags", coefficients(:,2) + ! If we're any higher, then we build cmat up to that order ! But we have to be careful because the last root we want to explicitly ! build up to here (ie the power of the matrix given by poly_sparsity_order) @@ -918,7 +921,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! of a complex conjugate pair, as we need to know that below to add in the rest ! of the poly_sparsity_order+1 term from that pair ! before moving on to the rest of the higher order roots - call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, & + coefficients(1:poly_sparsity_order + 1, 1:2), & cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex) end if @@ -1851,32 +1855,41 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi i_sparse = size(coefficients, 1) first_complex = .FALSE. + print *, "size coeffs", size(coefficients, 1) + if (output_product) then + output_first_complex = .FALSE. if (output_product) then i_sparse = poly_sparsity_order + 1 - ! If the one before is real, then we know we're on the first - if (coefficients(i_sparse-1,2) == 0d0) then - output_first_complex = .TRUE. - ! See discussion above - i_sparse = i_sparse + 1 + ! If the last root is real we don't have to do anything + if (coefficients(i_sparse,2) /= 0d0) then - ! If the one before is complex - else - - ! 
Check if the distance between the fixed sparsity root and the one before is > zero - ! If so they must be complex conjugates and hence we are on the second of the pair - if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .AND. & - abs(coefficients(i_sparse,2) + coefficients(i_sparse-1,2))/coefficients(i_sparse,2) > 1e-14) then + ! If the one before is real, then we know we're on the first + if (coefficients(i_sparse-1,2) == 0d0) then output_first_complex = .TRUE. + ! See discussion above i_sparse = i_sparse + 1 - end if + + ! If the one before is complex + else + + ! Check if the distance between the fixed sparsity root and the one before + ! If > zero then they are not complex conjugates and hence we are on the first of the pair + if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .AND. & + abs(coefficients(i_sparse,2) + coefficients(i_sparse-1,2))/coefficients(i_sparse,2) > 1e-14) then + output_first_complex = .TRUE. + i_sparse = i_sparse + 1 + end if + end if end if end if first_complex = output_first_complex end if + print *, "i_sparse", i_sparse, "output_first_complex", output_first_complex + ! ~~~~~~~~~~~~ ! Iterate over the i ! This is basically the same as the MF application but we have to build the powers @@ -1886,6 +1899,8 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We're always building up the next product do while (i .le. i_sparse - 1) + print *, "i = ", i + ! Duplicate & copy the matrix, but ensure there is a diagonal present ! temp_mat_A is going to store things with the sparsity of A if (PetscObjectIsNull(temp_mat_A)) then @@ -1898,6 +1913,8 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! If real this is easy if (coefficients(i,2) == 0d0) then + print *, "real", "i_sparse", i_sparse + ! Skips eigenvalues that are numerically zero - see ! the comment in calculate_gmres_polynomial_roots_newton if (abs(coefficients(i,1)) < 1e-12) then @@ -1945,6 +1962,8 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Complex else + print *, "complex" + ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then i = i + 2 @@ -1970,6 +1989,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then + print *, "outputting first part of product in complex case", "i_sparse", i_sparse, "i", i call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2005,10 +2025,12 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi if (i .le. i_sparse - 2) then + print *, "doing complex matmult step" + ! temp_mat_three = matrix * temp_mat_two call MatMatMult(matrix, temp_mat_two, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) - call MatDestroy(temp_mat_two, ierr) + call MatDestroy(temp_mat_two, ierr) ! Then add the scaled version of each product if (reuse_triggered) then @@ -2019,6 +2041,13 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Have to use the DIFFERENT_NONZERO_PATTERN here call MatAXPYWrapper(mat_product, -1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), temp_mat_three) end if + + ! 
We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. .NOT. first_complex) then + print *, "outputting product in complex case", "i_sparse", i_sparse, "i", i + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + call MatDestroy(temp_mat_three, ierr) else call MatDestroy(temp_mat_two, ierr) @@ -2036,7 +2065,9 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Add in the final term multiplied by 1/theta_poly_order ! Skips eigenvalues that are numerically zero - if (abs(coefficients(i,1)) > 1e-12) then + if (abs(coefficients(i,1)) > 1e-12) then + + print *, "doing last real step" if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) @@ -2049,7 +2080,9 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi end if call MatDestroy(temp_mat_A, ierr) - call MatDestroy(mat_product, ierr) + call MatDestroy(mat_product, ierr) + + !call exit(0) end subroutine build_gmres_polynomial_newton_inverse_full From ea31e5b52f850274c9d805502a70a544d1a3a67b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 5 Feb 2026 17:13:58 +0000 Subject: [PATCH 14/41] More careful about zero eigenvalues. Also saw a case where rounding in the rank deficient compute of harmonic ritz values lead to z negative eigenvalue (-1e-16). We now explicitly check for small eigenvalues in magnitude and set them to zero --- src/Gmres_Poly_Newton.F90 | 76 +++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 2876053..d0944cc 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -309,6 +309,15 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) end if + ! In some cases with rank deficiency, we can still end up with non-zero (or negative) eigenvalues that + ! are trivially small - we set them explicitly to zero + do i_loc = 1, poly_order + 1 + if (abs(coefficients(i_loc, 1)**2 + coefficients(i_loc, 2)**2) < 1e-12) then + coefficients(i_loc, 1) = 0d0 + coefficients(i_loc, 2) = 0d0 + end if + end do + ! ~~~~~~~~~~~~~~ ! Add roots for stability ! ~~~~~~~~~~~~~~ @@ -1180,12 +1189,18 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp term = term - 1 skip_add = .TRUE. end if + ! ! If the fixed sparsity root is real and the previous root was real, + ! ! we just need to compute the correct part of the product, we just make sure not to add + ! if (coefficients(term,2) == 0d0 .AND. coefficients(term-1,2) == 0d0) then + ! skip_add = .TRUE. + ! end if + + print *, "starting loop at term ", term, "skip_add ", skip_add ! This loop skips the last coefficient do while (term .le. size(coefficients, 1) - 1) - ! We need to sum up the product of vals_previous_power_temp(j_loc) * matching columns - vals_power_temp(1:ncols) = 0 + print *, "term ", term, "coeff ", coefficients(term,1), coefficients(term,2), skip_add ! If real if (coefficients(term,2) == 0d0) then @@ -1198,9 +1213,16 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! matrix by the build_gmres_polynomial_newton_inverse_full (as we had to build the product up ! to that order) ! ~~~~~~~~~~~ - if (ncols /= 0 .AND. 
abs(coefficients(term,1)) > 1e-12 .AND. term > poly_sparsity_order + 1) then - call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & - 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) + if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12 .AND. & + term > poly_sparsity_order + 1) then + + !if (.NOT. skip_add) then + print *, "CALLING SET VALUES ", term, " to row " + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) + ! else + ! skip_add = .FALSE. + ! end if end if ! Initialize with previous product before the A*prod subtraction @@ -1314,6 +1336,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Final step if last root is real if (coefficients(term,2) == 0d0) then if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then + print *, "adding REAL final term ", term, " coeff ", coefficients(term,1) call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) end if @@ -1723,7 +1746,7 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! solve we don't have a zero coefficient but in the second solve we do ! So the mat type needs to remain consistent ! This can't happen in the complex case - if (coefficients(2,1) == 0d0) then + if (abs(coefficients(2,1)) < 1e-12) then ! Set to zero call MatScale(inv_matrix, 0d0, ierr) @@ -1802,6 +1825,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi logical :: reuse_triggered, output_product, first_complex integer :: i, i_sparse type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 + PetscReal :: square_sum, a_coeff ! ~~~~~~ @@ -1855,7 +1879,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi i_sparse = size(coefficients, 1) first_complex = .FALSE. - print *, "size coeffs", size(coefficients, 1) + print *, "size coeffs", size(coefficients, 1), "coeffs", coefficients(:, 1), coefficients(:, 2) if (output_product) then @@ -1915,11 +1939,13 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi print *, "real", "i_sparse", i_sparse - ! Skips eigenvalues that are numerically zero - see - ! the comment in calculate_gmres_polynomial_roots_newton + ! Skips eigenvalues that are numerically zero + ! We still compute the entries as as zero because we need the sparsity + ! to be correct for the next iteration if (abs(coefficients(i,1)) < 1e-12) then - i = i + 1 - cycle + square_sum = 0 + else + square_sum = 1d0/coefficients(i,1) end if ! Then add the scaled version of each product @@ -1929,15 +1955,15 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi else if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + call MatAXPY(inv_matrix, square_sum, mat_product, SUBSET_NONZERO_PATTERN, ierr) else ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i,1), mat_product) + call MatAXPYWrapper(inv_matrix, square_sum, mat_product) end if end if ! temp_mat_A = A_ff/theta_k - call MatScale(temp_mat_A, -1d0/coefficients(i,1), ierr) + call MatScale(temp_mat_A, -square_sum, ierr) ! 
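            ! For a real root theta_k the Newton-basis recurrence being assembled here is,
            ! in terms of the local names (a sketch of the update, not extra code):
            !     inv_matrix  = inv_matrix + (1/theta_k) * mat_product
            !     mat_product = (I - A/theta_k) * mat_product
            ! When theta_k has been clamped to zero, square_sum is 0, so the AXPY above adds
            ! nothing numerically and temp_mat_A reduces to the identity held in the sparsity
            ! of A, which still grows the sparsity of mat_product correctly for the next root.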
temp_mat_A = I - A_ff/theta_k call MatShift(temp_mat_A, 1d0, ierr) @@ -1954,6 +1980,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then + print *, "outputting product in real case", "i_sparse", i_sparse, "i", i call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -1966,8 +1993,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then - i = i + 2 - cycle + square_sum = 0 + a_coeff = 0 + else + square_sum = 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2) + a_coeff = 2d0 * coefficients(i,1) end if ! If doing the normal iteration @@ -1976,7 +2006,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! temp_mat_A = -A call MatScale(temp_mat_A, -1d0, ierr) ! temp_mat_A = 2a I - A_ff - call MatShift(temp_mat_A, 2d0 * coefficients(i,1), ierr) + call MatShift(temp_mat_A, a_coeff, ierr) if (i == 1) then ! If i == 1 then we know mat_product is identity so we can do it directly @@ -2004,7 +2034,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) ! temp_mat_two = 2a * mat_product - call MatScale(temp_mat_two, 2d0 * coefficients(i,1), ierr) + call MatScale(temp_mat_two, a_coeff, ierr) ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then @@ -2016,11 +2046,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Then add the scaled version of each product if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), & + call MatAXPY(inv_matrix, square_sum, & temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) else ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), temp_mat_two) + call MatAXPYWrapper(inv_matrix, square_sum, temp_mat_two) end if if (i .le. i_sparse - 2) then @@ -2035,11 +2065,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Then add the scaled version of each product if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset - call MatAXPY(mat_product, -1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), & + call MatAXPY(mat_product, -square_sum, & temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) else ! Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(mat_product, -1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), temp_mat_three) + call MatAXPYWrapper(mat_product, -square_sum, temp_mat_three) end if ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply @@ -2067,7 +2097,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Skips eigenvalues that are numerically zero if (abs(coefficients(i,1)) > 1e-12) then - print *, "doing last real step" + print *, "doing last real step, adding in term", i, "coeff", coefficients(i,1) if (reuse_triggered) then ! 
If doing reuse we know our nonzeros are a subset call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) From ff5b9148eaed428fc53f0b0f3eb92a428042f301 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 15:49:01 +0000 Subject: [PATCH 15/41] Higher order fixed sparsity is now correct. Still have to add more testing --- src/Gmres_Poly_Newton.F90 | 193 ++++++++++++++++++++++++++------------ 1 file changed, 133 insertions(+), 60 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index d0944cc..40f761a 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -834,7 +834,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt, dimension(:), allocatable :: col_indices_off_proc_array, ad_indices, cols PetscReal, dimension(:), allocatable :: vals type(tIS), dimension(1) :: col_indices, row_indices - type(tMat) :: Ad, Ao, mat_sparsity_match + type(tMat) :: Ad, Ao, mat_sparsity_match, mat_product_save PetscInt, dimension(:), pointer :: colmap logical :: deallocate_submatrices = .FALSE. type(c_ptr) :: vals_c_ptr @@ -851,6 +851,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt, parameter :: one = 1, zero = 0 logical :: output_first_complex, skip_add PetscReal :: square_sum + integer, dimension(poly_order + 1, 2) :: status_output, status_product ! ~~~~~~~~~~ @@ -903,7 +904,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp if (coefficients(1,2) == 0d0 .AND. coefficients(2,2) /= 0d0) then call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & - cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex) + cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & + status_output, status_product, mat_product_save) else @@ -912,7 +914,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp call build_gmres_polynomial_newton_inverse_1st_1st(matrix, one, & coefficients(1:poly_sparsity_order + 1, 1:2), & - cmat, mat_sparsity_match) + cmat, mat_sparsity_match, & + status_output, status_product) end if else @@ -932,9 +935,18 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! before moving on to the rest of the higher order roots call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, & coefficients(1:poly_sparsity_order + 1, 1:2), & - cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex) + cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & + status_output, status_product, mat_product_save) end if + print *, "status output real", status_output(:, 1) + print *, "status output complex", status_output(:, 2) + + print *, "sum", sum(status_output, 2) + + print *, "status product real", status_product(:, 1) + print *, "status product complex", status_product(:, 2) + ! We know we will never have non-zero locations outside of the highest constrained sparsity power call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) call MatSetOption(cmat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) @@ -1188,12 +1200,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp if (coefficients(term,2) /= 0d0 .AND. .NOT. output_first_complex) then term = term - 1 skip_add = .TRUE. - end if - ! ! If the fixed sparsity root is real and the previous root was real, - ! ! 
we just need to compute the correct part of the product, we just make sure not to add - ! if (coefficients(term,2) == 0d0 .AND. coefficients(term-1,2) == 0d0) then - ! skip_add = .TRUE. - ! end if + end if print *, "starting loop at term ", term, "skip_add ", skip_add @@ -1205,6 +1212,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If real if (coefficients(term,2) == 0d0) then + print *, "REAL CASE assembly", term + ! ~~~~~~~~~~~ ! Now can add the value to our matrix ! Can skip this if coeff is zero, but still need to compute A^(term-1) @@ -1214,19 +1223,17 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! to that order) ! ~~~~~~~~~~~ if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12 .AND. & - term > poly_sparsity_order + 1) then + status_output(term, 1) /= 1) then - !if (.NOT. skip_add) then - print *, "CALLING SET VALUES ", term, " to row " - call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & - 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) - ! else - ! skip_add = .FALSE. - ! end if + print *, "ADDING IN REAL TERM ", term + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) end if ! Initialize with previous product before the A*prod subtraction - vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) + vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) + + print *, "DOING REAL PRODCUT for term ", term ! Have to finish all the columns before we move onto the next coefficient do j_loc = 1, ncols @@ -1246,19 +1253,19 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If complex else + print *, "COMPLEX CASE assembly", term + square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) if (.NOT. skip_add) then ! We skip the 2 * a * prod from the first root of a complex pair if that has already ! been included in the inv_matrix from build_gmres_polynomial_newton_inverse_full - if (term < poly_sparsity_order + 2) then - if (.NOT. output_first_complex) then - temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) - else - temp(1:ncols) = 0d0 - end if - else + if (status_output(term, 2) /= 1) then + print *, term, "adding in 2a prod" temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + else + print *, term, "skipping adding in 2a prod" + temp(1:ncols) = 0d0 end if ! This is the -A * prod @@ -1283,28 +1290,61 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! 2aprod/a^2+b^2 ! for (c, c, r) mat product is output without the 1/a^2+b^2 but that is fine as we ! compensate for that in the product - if (term < poly_sparsity_order + 2) then + if (status_output(term, 2) == 1) then if (output_first_complex) then + print *, "ADDING IN 2a prod second time for term ", term temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) end if end if ! First time through complex pair else + + print *, "SKIP ADDING IN COMPLEX TERM ", term + !@@@ for the case where we have (r, c, c, ....) and second order sparsity + ! i think the problem is that we have to skip adding anything to p as inverse_matrix + ! already has the correct values in it, as we computed tmp which will have 2nd order terms + ! in it, but we skipped the product in the full, which is correct as that would compute 3rd order + ! terms. 
so the thing that gets output in mat_prod_or_tmp is tmp + ! ! If we're skipping the add, then vals_previous_power_temp has all the correct ! values in it for temp ! All we have to do is compute prod for the next time through skip_add = .FALSE. + !@@@@ so then this line sets temp to be tmp temp(1:ncols) = vals_previous_power_temp(1:ncols) + ! @@@ have to be careful here! ! If we've gone back a term, we don't have anything in prod ! prod is I when term = 1 + ! @@@@ if we're doing this for the first time, we know product is I + ! so we just set prod to be I + ! @@@@ the problem is if we're not doing this for the first time + ! we need to know what prod had in it from the previous time, as our full + ! is only outputting prod or temp, not both, because at lower order when we output + ! temp in this case we knew prod was I so we didn't have to store both + ! in the (r, c, c) case prod will have been I - 1/theta_1 A_ff from the r + ! but for it to work with the loop below vals_previous_power_temp has to contain that but + ! over the sparsity of the 2nd order term. if (term == 1) then vals_previous_power_temp(1:ncols) = 0d0 if (diag_index /= -1) then vals_previous_power_temp(diag_index) = 1d0 end if + ! In the case the mat_product_save is not the identity, we need to pull it's value out + ! We only do this once for the first term in this case + else + + call MatGetRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + + ! We have guaranteed in the full version that mat_product_save has fixed sparsity + vals_previous_power_temp(1:ncols_two) = vals_two_ptr(1:ncols_two) + + call MatRestoreRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + end if end if @@ -1707,7 +1747,7 @@ end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coefficients, & - inv_matrix, mat_prod_or_temp) + inv_matrix, mat_prod_or_temp, status_output, status_product) ! Specific 1st order with 1st order sparsity @@ -1717,6 +1757,7 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients type(tMat), intent(inout) :: inv_matrix type(tMat), intent(inout), optional :: mat_prod_or_temp + integer, dimension(poly_order + 1, 2), intent(inout), optional :: status_output, status_product ! Local variables PetscErrorCode :: ierr @@ -1733,6 +1774,9 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) + status_output = 0 + status_product = 0 + ! We only have two coefficients, so they are either both real or complex conjugates ! If real if (coefficients(1,2) == 0d0) then @@ -1752,6 +1796,10 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe call MatScale(inv_matrix, 0d0, ierr) ! Then add in the 0th order inverse call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) + + !!@@@ need product here + print *, "CHECK/FIX THIS" + call exit(0) ! Then just return return @@ -1776,7 +1824,10 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! 
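         ! Expanded, the formula in the comment below is
         !     1/theta_1 * I + 1/theta_2 * (I - A/theta_1)
         !       = (1/theta_1 + 1/theta_2) * I - A/(theta_1 * theta_2)
         ! i.e. for two real roots the assembled first-order inverse is just a scaled copy of
         ! A_ff plus a diagonal shift.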
result = 1/theta_1 + 1/theta_2 * (I -A_ff/theta_1) ! Don't need an assemble as there is one called in this - call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) + call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) + + status_output(1:2, 1) = 1 + status_product(1,1) = 1 ! Complex conjugate roots, a +- ib else @@ -1796,6 +1847,9 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe end if ! result = 2a I - A_ff/(a^2 + b^2) call MatScale(inv_matrix, 1d0/square_sum, ierr) + + status_output(1:2, 2) = 1 + status_product(1,2) = 1 end if end subroutine build_gmres_polynomial_newton_inverse_1st_1st @@ -1804,7 +1858,8 @@ end subroutine build_gmres_polynomial_newton_inverse_1st_1st ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & - inv_matrix, mat_prod_or_temp, poly_sparsity_order, output_first_complex) + inv_matrix, mat_prod_or_temp, poly_sparsity_order, output_first_complex, & + status_output, status_product, mat_product_save) ! No constrained sparsity by default ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex @@ -1816,9 +1871,10 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi integer, intent(in) :: poly_order PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients type(tMat), intent(inout) :: inv_matrix - type(tMat), intent(inout), optional :: mat_prod_or_temp + type(tMat), intent(inout), optional :: mat_prod_or_temp, mat_product_save integer, intent(in), optional :: poly_sparsity_order logical, intent(inout), optional :: output_first_complex + integer, dimension(poly_order + 1, 2), intent(inout), optional :: status_output, status_product ! Local variables PetscErrorCode :: ierr @@ -1845,6 +1901,8 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We start with an identity in mat_product call generate_identity(matrix, mat_product) + status_output = 0 + status_product = 0 ! If we're going to output the product as part of a fixed sparsity multiply, ! we may be asking to constrain the sparsity to a power in between order and order + 2 @@ -1961,6 +2019,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatAXPYWrapper(inv_matrix, square_sum, mat_product) end if end if + status_output(i, 1) = 1 ! temp_mat_A = A_ff/theta_k call MatScale(temp_mat_A, -square_sum, ierr) @@ -1977,6 +2036,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatDestroy(mat_product, ierr) mat_product = mat_product_k_plus_1 end if + status_product(i, 1) = maxval(status_product) + 1 ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then @@ -1989,7 +2049,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Complex else - print *, "complex" + print *, "complex", first_complex ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then @@ -2000,13 +2060,34 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi a_coeff = 2d0 * coefficients(i,1) end if - ! If doing the normal iteration - if (.NOT. first_complex) then + ! If our fixed sparsity root is the first of a complex conjugate pair + ! 
We want to pass out mat_product and only add that to inv_matrix + ! This is equivalent to only part of tmp on Line 9 of Loe + ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) + ! as this is the part that would increase the sparsity beyond poly_sparsity_order + if (i == poly_sparsity_order + 1 .AND. first_complex) then + + ! Copy mat_product into temp_mat_two + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + + ! temp_mat_two = 2a * mat_product + call MatScale(temp_mat_two, a_coeff, ierr) + status_output(i, 2) = 1 + + ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + ! Just do the normal loop + else ! temp_mat_A = -A call MatScale(temp_mat_A, -1d0, ierr) ! temp_mat_A = 2a I - A_ff call MatShift(temp_mat_A, a_coeff, ierr) + status_output(i, 2) = 1 + status_output(i+1, 2) = 1 if (i == 1) then ! If i == 1 then we know mat_product is identity so we can do it directly @@ -2016,31 +2097,22 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatMatMult(temp_mat_A, mat_product, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) end if + status_product(i, 2) = maxval(status_product) + 1 ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then - print *, "outputting first part of product in complex case", "i_sparse", i_sparse, "i", i - call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) - end if - - ! If instead we only have the first of a complex conjugate pair - ! We want to pass out mat_product and only add that to inv_matrix - ! This is equivalent to only part of tmp on Line 9 of Loe - ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) - ! as this is the part that would increase the sparsity beyond poly_sparsity_order - else - - ! Copy mat_product into temp_mat_two - call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) - - ! temp_mat_two = 2a * mat_product - call MatScale(temp_mat_two, a_coeff, ierr) - - ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply - if (output_product .AND. i > i_sparse - 2) then - call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) - end if - + print *, "outputting TEMP in complex case", "i_sparse", i_sparse, "i", i + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + ! If i == 1 then we know mat_product is the identity and we don't bother + ! to write it out, we just have some custom code in the product given its trivial + if (i /= 1) then + ! This ensures it has the matching sparsity + call MatConvert(mat_prod_or_temp, MATSAME, MAT_INITIAL_MATRIX, mat_product_save, ierr) + ! This zeros mat_product_save and then puts mat_product into the sparsity pattern + ! of mat_prod_or_temp + call MatCopy(mat_product, mat_product_save, DIFFERENT_NONZERO_PATTERN, ierr) + end if + end if end if ! Then add the scaled version of each product @@ -2060,7 +2132,9 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! 
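               ! Remaining half of the conjugate-pair update for theta = a +/- ib
               ! (sketch, with d = a^2 + b^2): temp_mat_two = (2a*I - A) * mat_product was
               ! formed above and temp_mat_two/d has already been added into inv_matrix;
               ! here the product is advanced as
               !     mat_product = mat_product - A * temp_mat_two / d
               ! via the MatMatMult (A * temp_mat_two) and the -square_sum scaled AXPY below.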
temp_mat_three = matrix * temp_mat_two call MatMatMult(matrix, temp_mat_two, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) - call MatDestroy(temp_mat_two, ierr) + call MatDestroy(temp_mat_two, ierr) + status_output(i, 2) = 1 + status_product(i+1, 2) = maxval(status_product) + 1 ! Then add the scaled version of each product if (reuse_triggered) then @@ -2075,7 +2149,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. .NOT. first_complex) then print *, "outputting product in complex case", "i_sparse", i_sparse, "i", i - call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if call MatDestroy(temp_mat_three, ierr) @@ -2105,14 +2179,13 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Have to use the DIFFERENT_NONZERO_PATTERN here call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i,1), mat_product) end if + status_output(i, 1) = 1 end if end if end if call MatDestroy(temp_mat_A, ierr) call MatDestroy(mat_product, ierr) - - !call exit(0) end subroutine build_gmres_polynomial_newton_inverse_full From e9e654a94078b2a80c4a1aba760467fa06f57449 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 16:36:43 +0000 Subject: [PATCH 16/41] Output was wrong in gmres polynomial comparison test --- tests/ex12f_gmres_poly.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ex12f_gmres_poly.F90 b/tests/ex12f_gmres_poly.F90 index bdbf8ce..dc597bb 100644 --- a/tests/ex12f_gmres_poly.F90 +++ b/tests/ex12f_gmres_poly.F90 @@ -148,18 +148,18 @@ program main ! A_ff on each level they should be basically identical and hence ! we should have almost no difference in the resulting residual ! ~~~~~~~~~~~~~ - norm_diff_one = abs(norm_power - norm_newton)/norm_newton + norm_diff_one = abs(norm_arnoldi - norm_newton)/norm_arnoldi if (norm_diff_one > 1e-9) then print *, "Residuals differ between polynomial bases!", norm_diff_one - print *, "Power basis residual: ", norm_power print *, "Newton basis residual: ", norm_newton + print *, "Arnoldi basis residual: ", norm_arnoldi error stop 1 end if - norm_diff_two = abs(norm_arnoldi - norm_power)/norm_power + norm_diff_two = abs(norm_arnoldi - norm_power)/norm_arnoldi if (norm_diff_two > 1e-9) then print *, "Residuals differ between polynomial bases!", norm_diff_two + print *, "Power basis residual: ", norm_power print *, "Arnoldi basis residual: ", norm_arnoldi - print *, "Newton basis residual: ", norm_newton error stop 1 end if From 0aeed7e03a84536fb6faa3d962d3b3d60529c624 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 16:37:03 +0000 Subject: [PATCH 17/41] Add tests for fixed sparsity --- src/Gmres_Poly_Newton.F90 | 112 ++++++++++++++++++++++---------------- tests/Makefile | 7 ++- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 40f761a..8fd36d7 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -919,8 +919,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp end if else - print *,"reals", coefficients(:,1) - print *,"imags", coefficients(:,2) + ! print *,"reals", coefficients(:,1) + ! print *,"imags", coefficients(:,2) ! If we're any higher, then we build cmat up to that order ! 
But we have to be careful because the last root we want to explicitly @@ -939,13 +939,13 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp status_output, status_product, mat_product_save) end if - print *, "status output real", status_output(:, 1) - print *, "status output complex", status_output(:, 2) + ! print *, "status output real", status_output(:, 1) + ! print *, "status output complex", status_output(:, 2) - print *, "sum", sum(status_output, 2) + ! print *, "sum", sum(status_output, 2) - print *, "status product real", status_product(:, 1) - print *, "status product complex", status_product(:, 2) + ! print *, "status product real", status_product(:, 1) + ! print *, "status product complex", status_product(:, 2) ! We know we will never have non-zero locations outside of the highest constrained sparsity power call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) @@ -1202,17 +1202,17 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp skip_add = .TRUE. end if - print *, "starting loop at term ", term, "skip_add ", skip_add + !print *, "starting loop at term ", term, "skip_add ", skip_add ! This loop skips the last coefficient do while (term .le. size(coefficients, 1) - 1) - print *, "term ", term, "coeff ", coefficients(term,1), coefficients(term,2), skip_add + !print *, "term ", term, "coeff ", coefficients(term,1), coefficients(term,2), skip_add ! If real if (coefficients(term,2) == 0d0) then - print *, "REAL CASE assembly", term + !print *, "REAL CASE assembly", term ! ~~~~~~~~~~~ ! Now can add the value to our matrix @@ -1225,7 +1225,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12 .AND. & status_output(term, 1) /= 1) then - print *, "ADDING IN REAL TERM ", term + !print *, "ADDING IN REAL TERM ", term call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) end if @@ -1233,7 +1233,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Initialize with previous product before the A*prod subtraction vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) - print *, "DOING REAL PRODCUT for term ", term + !print *, "DOING REAL PRODCUT for term ", term ! Have to finish all the columns before we move onto the next coefficient do j_loc = 1, ncols @@ -1253,7 +1253,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If complex else - print *, "COMPLEX CASE assembly", term + !print *, "COMPLEX CASE assembly", term square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) if (.NOT. skip_add) then @@ -1261,10 +1261,10 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! We skip the 2 * a * prod from the first root of a complex pair if that has already ! been included in the inv_matrix from build_gmres_polynomial_newton_inverse_full if (status_output(term, 2) /= 1) then - print *, term, "adding in 2a prod" + !print *, term, "adding in 2a prod" temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) else - print *, term, "skipping adding in 2a prod" + !print *, term, "skipping adding in 2a prod" temp(1:ncols) = 0d0 end if @@ -1292,7 +1292,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! 
compensate for that in the product if (status_output(term, 2) == 1) then if (output_first_complex) then - print *, "ADDING IN 2a prod second time for term ", term + !print *, "ADDING IN 2a prod second time for term ", term temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) end if end if @@ -1300,7 +1300,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! First time through complex pair else - print *, "SKIP ADDING IN COMPLEX TERM ", term + !print *, "SKIP ADDING IN COMPLEX TERM ", term !@@@ for the case where we have (r, c, c, ....) and second order sparsity ! i think the problem is that we have to skip adding anything to p as inverse_matrix ! already has the correct values in it, as we computed tmp which will have 2nd order terms @@ -1376,7 +1376,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Final step if last root is real if (coefficients(term,2) == 0d0) then if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then - print *, "adding REAL final term ", term, " coeff ", coefficients(term,1) + !print *, "adding REAL final term ", term, " coeff ", coefficients(term,1) call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) end if @@ -1774,8 +1774,10 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - status_output = 0 - status_product = 0 + if (output_product) then + status_output = 0 + status_product = 0 + end if ! We only have two coefficients, so they are either both real or complex conjugates ! If real @@ -1826,8 +1828,10 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! Don't need an assemble as there is one called in this call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) - status_output(1:2, 1) = 1 - status_product(1,1) = 1 + if (output_product) then + status_output(1:2, 1) = 1 + status_product(1,1) = 1 + end if ! Complex conjugate roots, a +- ib else @@ -1848,8 +1852,10 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! result = 2a I - A_ff/(a^2 + b^2) call MatScale(inv_matrix, 1d0/square_sum, ierr) - status_output(1:2, 2) = 1 - status_product(1,2) = 1 + if (output_product) then + status_output(1:2, 2) = 1 + status_product(1,2) = 1 + end if end if end subroutine build_gmres_polynomial_newton_inverse_1st_1st @@ -1879,7 +1885,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Local variables PetscErrorCode :: ierr logical :: reuse_triggered, output_product, first_complex - integer :: i, i_sparse + integer :: i, i_sparse, sparsity_order type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 PetscReal :: square_sum, a_coeff @@ -1901,8 +1907,14 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We start with an identity in mat_product call generate_identity(matrix, mat_product) - status_output = 0 - status_product = 0 + if (output_product) then + status_output = 0 + status_product = 0 + end if + sparsity_order = poly_order + if (present(poly_sparsity_order)) then + sparsity_order = poly_sparsity_order + end if ! If we're going to output the product as part of a fixed sparsity multiply, ! 
we may be asking to constrain the sparsity to a power in between order and order + 2 @@ -1937,13 +1949,13 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi i_sparse = size(coefficients, 1) first_complex = .FALSE. - print *, "size coeffs", size(coefficients, 1), "coeffs", coefficients(:, 1), coefficients(:, 2) + !print *, "size coeffs", size(coefficients, 1), "coeffs", coefficients(:, 1), coefficients(:, 2) if (output_product) then output_first_complex = .FALSE. if (output_product) then - i_sparse = poly_sparsity_order + 1 + i_sparse = sparsity_order + 1 ! If the last root is real we don't have to do anything if (coefficients(i_sparse,2) /= 0d0) then @@ -1970,7 +1982,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi first_complex = output_first_complex end if - print *, "i_sparse", i_sparse, "output_first_complex", output_first_complex + !print *, "i_sparse", i_sparse, "output_first_complex", output_first_complex ! ~~~~~~~~~~~~ ! Iterate over the i @@ -1981,7 +1993,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We're always building up the next product do while (i .le. i_sparse - 1) - print *, "i = ", i + !print *, "i = ", i ! Duplicate & copy the matrix, but ensure there is a diagonal present ! temp_mat_A is going to store things with the sparsity of A @@ -1995,7 +2007,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! If real this is easy if (coefficients(i,2) == 0d0) then - print *, "real", "i_sparse", i_sparse + !print *, "real", "i_sparse", i_sparse ! Skips eigenvalues that are numerically zero ! We still compute the entries as as zero because we need the sparsity @@ -2019,7 +2031,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatAXPYWrapper(inv_matrix, square_sum, mat_product) end if end if - status_output(i, 1) = 1 + if (output_product) status_output(i, 1) = 1 ! temp_mat_A = A_ff/theta_k call MatScale(temp_mat_A, -square_sum, ierr) @@ -2036,11 +2048,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatDestroy(mat_product, ierr) mat_product = mat_product_k_plus_1 end if - status_product(i, 1) = maxval(status_product) + 1 + if (output_product) status_product(i, 1) = maxval(status_product) + 1 ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then - print *, "outputting product in real case", "i_sparse", i_sparse, "i", i + !print *, "outputting product in real case", "i_sparse", i_sparse, "i", i call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2049,7 +2061,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Complex else - print *, "complex", first_complex + !print *, "complex", first_complex ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then @@ -2065,14 +2077,14 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! This is equivalent to only part of tmp on Line 9 of Loe ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) ! as this is the part that would increase the sparsity beyond poly_sparsity_order - if (i == poly_sparsity_order + 1 .AND. first_complex) then + if (i == sparsity_order + 1 .AND. first_complex) then ! 
Copy mat_product into temp_mat_two call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) ! temp_mat_two = 2a * mat_product call MatScale(temp_mat_two, a_coeff, ierr) - status_output(i, 2) = 1 + if (output_product) status_output(i, 2) = 1 ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then @@ -2086,8 +2098,10 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatScale(temp_mat_A, -1d0, ierr) ! temp_mat_A = 2a I - A_ff call MatShift(temp_mat_A, a_coeff, ierr) - status_output(i, 2) = 1 - status_output(i+1, 2) = 1 + if (output_product) then + status_output(i, 2) = 1 + status_output(i+1, 2) = 1 + end if if (i == 1) then ! If i == 1 then we know mat_product is identity so we can do it directly @@ -2097,11 +2111,11 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatMatMult(temp_mat_A, mat_product, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) end if - status_product(i, 2) = maxval(status_product) + 1 + if (output_product) status_product(i, 2) = maxval(status_product) + 1 ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then - print *, "outputting TEMP in complex case", "i_sparse", i_sparse, "i", i + !print *, "outputting TEMP in complex case", "i_sparse", i_sparse, "i", i call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) ! If i == 1 then we know mat_product is the identity and we don't bother ! to write it out, we just have some custom code in the product given its trivial @@ -2127,14 +2141,16 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi if (i .le. i_sparse - 2) then - print *, "doing complex matmult step" + !print *, "doing complex matmult step" ! temp_mat_three = matrix * temp_mat_two call MatMatMult(matrix, temp_mat_two, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) call MatDestroy(temp_mat_two, ierr) - status_output(i, 2) = 1 - status_product(i+1, 2) = maxval(status_product) + 1 + if (output_product) then + status_output(i, 2) = 1 + status_product(i+1, 2) = maxval(status_product) + 1 + end if ! Then add the scaled version of each product if (reuse_triggered) then @@ -2148,7 +2164,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. .NOT. first_complex) then - print *, "outputting product in complex case", "i_sparse", i_sparse, "i", i + !print *, "outputting product in complex case", "i_sparse", i_sparse, "i", i call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2171,7 +2187,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Skips eigenvalues that are numerically zero if (abs(coefficients(i,1)) > 1e-12) then - print *, "doing last real step, adding in term", i, "coeff", coefficients(i,1) + !print *, "doing last real step, adding in term", i, "coeff", coefficients(i,1) if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) @@ -2179,7 +2195,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! 
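                     ! MatAXPY's MatStructure argument matters here: SUBSET_NONZERO_PATTERN tells
                     ! PETSc the added matrix's nonzeros are contained in the target's, so no new
                     ! allocation or pattern merge is needed, whereas the non-reuse branch goes
                     ! through MatAXPYWrapper with DIFFERENT_NONZERO_PATTERN, which must merge
                     ! the two patterns and is correspondingly more expensive.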
Have to use the DIFFERENT_NONZERO_PATTERN here call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i,1), mat_product) end if - status_output(i, 1) = 1 + if (output_product) status_output(i, 1) = 1 end if end if end if diff --git a/tests/Makefile b/tests/Makefile index 5a5081f..47a11c3 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -134,10 +134,13 @@ run_tests_load_serial: echo "--- Testing order = $$order ---"; \ ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ done - @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders first order fixed sparsity" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity in parallel" @for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ - ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + done; \ done # ~~~~~~~~~~~ From 798b1350e35e2b3e92ce0b12dd3e0fca900a0a3d Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 18:34:44 +0000 Subject: [PATCH 18/41] Fix 1st order sparsity --- src/Gmres_Poly_Newton.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 8fd36d7..efa811f 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -1757,7 +1757,7 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients type(tMat), intent(inout) :: inv_matrix type(tMat), intent(inout), optional :: mat_prod_or_temp - integer, dimension(poly_order + 1, 2), intent(inout), optional :: status_output, status_product + integer, dimension(:, :), intent(inout), optional :: status_output, status_product ! Local variables PetscErrorCode :: ierr @@ -1880,7 +1880,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi type(tMat), intent(inout), optional :: mat_prod_or_temp, mat_product_save integer, intent(in), optional :: poly_sparsity_order logical, intent(inout), optional :: output_first_complex - integer, dimension(poly_order + 1, 2), intent(inout), optional :: status_output, status_product + integer, dimension(:, :), intent(inout), optional :: status_output, status_product ! Local variables PetscErrorCode :: ierr From 0bf4cd9b88e7e06cc57bf8aea0df40dc345b2106 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 18:56:07 +0000 Subject: [PATCH 19/41] Fixed 1st order case where (r,r) but second is zero --- src/Gmres_Poly_Newton.F90 | 43 +++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index efa811f..6c437ce 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -851,7 +851,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt, parameter :: one = 1, zero = 0 logical :: output_first_complex, skip_add PetscReal :: square_sum - integer, dimension(poly_order + 1, 2) :: status_output, status_product + integer, dimension(poly_order + 1, 2) :: status_output ! 
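+    ! status_output is a small flag array over the polynomial terms: it records which
+    ! contributions were already added to cmat by the constrained-sparsity build routines
+    ! (see e.g. the status_output(term, 1) /= 1 test in the assembly loop below), so that
+    ! the fixed sparsity assembly here does not add those terms a second time.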
~~~~~~~~~~ @@ -905,7 +905,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & - status_output, status_product, mat_product_save) + status_output, mat_product_save) else @@ -915,7 +915,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp call build_gmres_polynomial_newton_inverse_1st_1st(matrix, one, & coefficients(1:poly_sparsity_order + 1, 1:2), & cmat, mat_sparsity_match, & - status_output, status_product) + status_output) end if else @@ -936,16 +936,13 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, & coefficients(1:poly_sparsity_order + 1, 1:2), & cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & - status_output, status_product, mat_product_save) + status_output, mat_product_save) end if ! print *, "status output real", status_output(:, 1) ! print *, "status output complex", status_output(:, 2) - ! print *, "sum", sum(status_output, 2) - - ! print *, "status product real", status_product(:, 1) - ! print *, "status product complex", status_product(:, 2) + ! print *, "sum", sum(status_output, 2) ! We know we will never have non-zero locations outside of the highest constrained sparsity power call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) @@ -1747,7 +1744,7 @@ end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coefficients, & - inv_matrix, mat_prod_or_temp, status_output, status_product) + inv_matrix, mat_prod_or_temp, status_output) ! Specific 1st order with 1st order sparsity @@ -1757,7 +1754,7 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients type(tMat), intent(inout) :: inv_matrix type(tMat), intent(inout), optional :: mat_prod_or_temp - integer, dimension(:, :), intent(inout), optional :: status_output, status_product + integer, dimension(:, :), intent(inout), optional :: status_output ! Local variables PetscErrorCode :: ierr @@ -1776,7 +1773,6 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe if (output_product) then status_output = 0 - status_product = 0 end if ! We only have two coefficients, so they are either both real or complex conjugates @@ -1796,12 +1792,17 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe ! Set to zero call MatScale(inv_matrix, 0d0, ierr) - ! Then add in the 0th order inverse - call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) - !!@@@ need product here - print *, "CHECK/FIX THIS" - call exit(0) + ! Tricky case here as we want to pass out the identity with the + ! sparsity of A + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + call MatShift(mat_prod_or_temp, 1d0, ierr) + status_output(1:2, 1) = 1 + end if + + ! Then add in the 0th order inverse + call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) ! 
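+         ! In other words, with the second root numerically zero the assembled inverse
+         ! collapses to the 0th order one, (1/theta_1) I, just stored in the sparsity
+         ! pattern of A (the copy was zeroed above and only the diagonal shifted), while
+         ! the caller still gets the identity back as the running product via mat_prod_or_temp.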
Then just return return @@ -1830,7 +1831,6 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe if (output_product) then status_output(1:2, 1) = 1 - status_product(1,1) = 1 end if ! Complex conjugate roots, a +- ib @@ -1854,7 +1854,6 @@ subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coe if (output_product) then status_output(1:2, 2) = 1 - status_product(1,2) = 1 end if end if @@ -1865,7 +1864,7 @@ end subroutine build_gmres_polynomial_newton_inverse_1st_1st subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & inv_matrix, mat_prod_or_temp, poly_sparsity_order, output_first_complex, & - status_output, status_product, mat_product_save) + status_output, mat_product_save) ! No constrained sparsity by default ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex @@ -1880,7 +1879,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi type(tMat), intent(inout), optional :: mat_prod_or_temp, mat_product_save integer, intent(in), optional :: poly_sparsity_order logical, intent(inout), optional :: output_first_complex - integer, dimension(:, :), intent(inout), optional :: status_output, status_product + integer, dimension(:, :), intent(inout), optional :: status_output ! Local variables PetscErrorCode :: ierr @@ -1909,7 +1908,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call generate_identity(matrix, mat_product) if (output_product) then status_output = 0 - status_product = 0 end if sparsity_order = poly_order if (present(poly_sparsity_order)) then @@ -2048,7 +2046,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatDestroy(mat_product, ierr) mat_product = mat_product_k_plus_1 end if - if (output_product) status_product(i, 1) = maxval(status_product) + 1 ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then @@ -2111,7 +2108,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatMatMult(temp_mat_A, mat_product, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) end if - if (output_product) status_product(i, 2) = maxval(status_product) + 1 ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then @@ -2149,7 +2145,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi call MatDestroy(temp_mat_two, ierr) if (output_product) then status_output(i, 2) = 1 - status_product(i+1, 2) = maxval(status_product) + 1 end if ! Then add the scaled version of each product From bbbbac31a4df0409c7a003082c85b142a29b1b69 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 21:44:04 +0000 Subject: [PATCH 20/41] Added eigenvalue clustering to improve stbility of Newton form of GMRES polynomials. Also ensure all the exact zero eigenvalues are moved to the end of the coefficients array. --- src/Gmres_Poly_Newton.F90 | 495 ++++++++++++++++++++++++++++---------- tests/Makefile | 49 +++- 2 files changed, 409 insertions(+), 135 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 6c437ce..66bc5c8 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -135,6 +135,240 @@ subroutine modified_leja(real_roots, imag_roots, indices) end subroutine modified_leja + ! 
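+  ! A small worked illustration of what the routine below does (the numbers are made up
+  ! purely to show the arithmetic, they are not from a real run):
+  !   real_roots = (/ 2d0, 2d0 + 1d-12, 5d-1 /), imag_roots = 0d0, rel_tol ~ 1e-8, abs_tol ~ 1e-15
+  ! The first two entries lie within abs_tol + rel_tol*max(|theta_i|, |theta_j|, 1) of each
+  ! other, so they are averaged into a single root ~2d0 and the result is
+  !   real_roots = (/ 2d0, 5d-1, 0d0 /)
+  ! with the freed slot zeroed and left at the end of the array.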
------------------------------------------------------------------------------------------------- + + subroutine cluster_eigenvalues_stable(real_roots, imag_roots, rel_tol, abs_tol) + + ! Robust clustering of (possibly complex) harmonic Ritz values. + ! Numerically distinct clusters are moved to the front. + ! Remaining entries are set to zero. + ! Skips eigenvalues that are exactly zero (both real and imag parts). + ! + ! Inputs: + ! real_roots, imag_roots : eigenvalues (length k) + ! rel_tol : relative tolerance (suggest sqrt(eps) ~ 1e-8) + ! abs_tol : absolute tolerance (suggest eps * ||H|| ~ 1e-15) + ! + ! Outputs: + ! real_roots, imag_roots : clustered eigenvalues at front, zeros after + + PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots + PetscReal, intent(in) :: rel_tol, abs_tol + integer :: i, j, n, n_unique, cluster_size + logical, allocatable :: used(:) + PetscReal :: dist, mag_i, mag_j, scale + PetscReal :: sum_real, sum_imag + PetscReal, allocatable :: rtmp(:), itmp(:) + + n = size(real_roots) + + allocate(used(n)) + allocate(rtmp(n), itmp(n)) + + used = .false. + n_unique = 0 + + ! --------------------------------------------------------- + ! All-pairs clustering (no sorting to preserve proximity) + ! --------------------------------------------------------- + do i = 1, n + + if (used(i)) cycle + + ! Skip eigenvalues that are exactly zero + if (real_roots(i) == 0.0d0 .AND. imag_roots(i) == 0.0d0) then + used(i) = .true. + cycle + end if + + ! Start new cluster with eigenvalue i + sum_real = real_roots(i) + sum_imag = imag_roots(i) + cluster_size = 1 + used(i) = .true. + + mag_i = sqrt(real_roots(i)**2 + imag_roots(i)**2) + + ! Look for all other eigenvalues close to this one + do j = i + 1, n + + if (used(j)) cycle + + ! Skip exactly zero eigenvalues + if (real_roots(j) == 0.0d0 .AND. imag_roots(j) == 0.0d0) then + used(j) = .true. + cycle + end if + + mag_j = sqrt(real_roots(j)**2 + imag_roots(j)**2) + + ! Distance between eigenvalues + dist = sqrt((real_roots(j) - real_roots(i))**2 + & + (imag_roots(j) - imag_roots(i))**2) + + ! Use the larger magnitude for relative scaling + scale = max(mag_i, mag_j, 1.0d0) + + ! Check if within tolerance + if (dist <= abs_tol + rel_tol * scale) then + sum_real = sum_real + real_roots(j) + sum_imag = sum_imag + imag_roots(j) + cluster_size = cluster_size + 1 + used(j) = .true. + end if + + end do + + ! Compute cluster centroid (mean) + n_unique = n_unique + 1 + rtmp(n_unique) = sum_real / dble(cluster_size) + itmp(n_unique) = sum_imag / dble(cluster_size) + + end do + + ! --------------------------------------------------------- + ! Output compact form + ! --------------------------------------------------------- + real_roots = 0.0d0 + imag_roots = 0.0d0 + + real_roots(1:n_unique) = rtmp(1:n_unique) + imag_roots(1:n_unique) = itmp(1:n_unique) + + deallocate(used, rtmp, itmp) + + end subroutine cluster_eigenvalues_stable + + + ! ------------------------------------------------------------------------------------------------- + + subroutine compute_extra_roots(real_roots, imag_roots, real_roots_output, imag_roots_output) + + ! Add extra roots for stability + ! Computes the product of factors for each eigenvalue and adds extra copies + ! of roots that have large products (to improve polynomial stability) + ! Only non-zero eigenvalues should be passed in + ! real_roots_output, imag_roots_output are allocated and filled with the original + ! roots plus any extra copies, with perturbed values for the leja sort + + ! 
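+    ! As a rough worked example of the criterion in the loop below (numbers chosen only
+    ! to illustrate the arithmetic): if a root theta_k has
+    !   pof(k) = product over i /= k of |1 - theta_k/theta_i| ~ 1e20
+    ! then log10(pof) = 20 > 4 and ceiling((20 - 4)/14) = 2 extra copies of theta_k are
+    ! appended (with another 2 for its conjugate when theta_k is complex).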
~~~~~~ + PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots + PetscReal, dimension(:), allocatable, intent(inout) :: real_roots_output, imag_roots_output + + ! Local variables + integer :: i_loc, j_loc, k_loc, n_roots, total_extra, counter + PetscReal :: a, b, c, d, div_real, div_imag, div_mag + PetscReal, dimension(size(real_roots)) :: pof + integer, dimension(size(real_roots)) :: extra_pair_roots, overflow + + ! ~~~~~~ + + n_roots = size(real_roots) + + ! Compute the product of factors + pof = 1 + extra_pair_roots = 0 + overflow = 0 + total_extra = 0 + do k_loc = 1, n_roots + + a = real_roots(k_loc) + b = imag_roots(k_loc) + + ! We have already computed pof for the positive imaginary complex conjugate + if (b < 0) cycle + + ! Skips eigenvalues that are numerically zero + if (abs(a) < 1e-12) cycle + if (a**2 + b**2 < 1e-12) cycle + + ! Compute product(k)_{i, j/=i} * | 1 - theta_j/theta_i| + do i_loc = 1, n_roots + + ! Skip + if (k_loc == i_loc) cycle + + c = real_roots(i_loc) + d = imag_roots(i_loc) + + ! Skips eigenvalues that are numerically zero + if (abs(c) < 1e-12) cycle + if (c**2 + d**2 < 1e-12) cycle + + ! theta_k/theta_i + div_real = (a * c + b * d)/(c**2 + d**2) + div_imag = (b * c - a * d)/(c**2 + d**2) + + ! |1 - theta_k/theta_i| + div_mag = sqrt((1 - div_real)**2 + div_imag**2) + + ! Pof is about to overflow, store the exponent and + ! reset pof back to one + ! We can hit this for very high order polynomials, where we have to + ! add more roots than 22 (ie pof > 1e308) + if (log10(pof(k_loc)) + log10(div_mag) > 307) then + overflow(k_loc) = overflow(k_loc) + int(log10(pof(k_loc))) + pof(k_loc) = 1 + end if + + ! Product + pof(k_loc) = pof(k_loc) * div_mag + + end do + + ! If pof > 1e4, we add an extra root, plus one extra for every 1e14 + if (log10(pof(k_loc)) > 4 .OR. overflow(k_loc) /= 0) then + + ! if real extra_pair_roots counts each distinct real root we're adding + ! if imaginary it only counts a pair as one + extra_pair_roots(k_loc) = ceiling((log10(pof(k_loc)) + overflow(k_loc) - 4.0)/14.0) + total_extra = total_extra + extra_pair_roots(k_loc) + + ! If imaginary, the pof is the same for the conjugate, let's just set it to -1 + if (b > 0) then + ! We know the positive imaginary value is first, so the conjugate follows it + pof(k_loc+1) = -1 + ! We need the conjugates as well + total_extra = total_extra + extra_pair_roots(k_loc) + + end if + end if + end do + + ! Allocate output arrays (original roots + extra roots) + allocate(real_roots_output(n_roots + total_extra)) + allocate(imag_roots_output(n_roots + total_extra)) + real_roots_output = 0d0 + imag_roots_output = 0d0 + + ! Copy in original roots + real_roots_output(1:n_roots) = real_roots(1:n_roots) + imag_roots_output(1:n_roots) = imag_roots(1:n_roots) + + ! Add the extra copies of roots, ensuring conjugate pairs we add + ! are next to each other + counter = n_roots + 1 + do i_loc = 1, n_roots + + ! For each extra root pair to add + do j_loc = 1, extra_pair_roots(i_loc) + + real_roots_output(counter) = real_roots(i_loc) + imag_roots_output(counter) = imag_roots(i_loc) + ! Add in the conjugate + if (imag_roots(i_loc) > 0) then + real_roots_output(counter+1) = real_roots(i_loc) + imag_roots_output(counter+1) = -imag_roots(i_loc) + end if + + counter = counter + 1 + if (imag_roots(i_loc) > 0) counter = counter + 1 + end do + end do + + end subroutine compute_extra_roots + + ! 
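+  ! For orientation (a sketch of the standard Newton-basis recurrence, not a change in
+  ! behaviour): the roots produced above are consumed roughly as
+  !   p    <- p + prod/theta_k
+  !   prod <- (I - A/theta_k) * prod
+  ! for a real root, with conjugate pairs applied together, so each extra copy of a root
+  ! simply appears one more time in that product. The extra copies are only perturbed for
+  ! the Leja ordering; the unperturbed values are what is actually applied.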
------------------------------------------------------------------------------------------------------------------------------- subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots, coefficients) @@ -160,22 +394,22 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots ! Local variables PetscInt :: global_rows, global_cols, local_rows, local_cols integer :: lwork, subspace_size, rank, i_loc, comm_size, comm_rank, errorcode, iwork_size, j_loc - integer :: total_extra, counter, k_loc, m + integer :: total_extra, counter, k_loc, m, numerical_order PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX PetscReal, dimension(poly_order+2,poly_order+1) :: H_n PetscReal, dimension(poly_order+1,poly_order+2) :: H_n_T - PetscReal, dimension(poly_order+1) :: e_d, solution, s, pof - integer, dimension(poly_order+1) :: extra_pair_roots, overflow + PetscReal, dimension(poly_order+1) :: e_d, solution, s integer, dimension(:), allocatable :: iwork_allocated, indices - PetscReal, dimension(:), allocatable :: work + PetscReal, dimension(:), allocatable :: work, real_roots_added, imag_roots_added + PetscReal, dimension(:), allocatable :: perturbed_real, perturbed_imag PetscReal, dimension(:,:), allocatable :: VL, VR - PetscReal :: beta, div_real, div_imag, a, b, c, d, div_mag + PetscReal :: beta PetscReal, dimension(:, :), allocatable :: coefficients_temp type(tVec) :: w_j type(tVec), dimension(poly_order+2) :: V_n logical :: use_harmonic_ritz = .TRUE. - PetscReal :: rcond = 1e-12 + PetscReal :: rcond = 1e-12, rel_tol, abs_tol, H_norm ! ~~~~~~ @@ -264,8 +498,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots call dgelsd(poly_order + 1, poly_order + 1, 1, H_n_T, size(H_n_T, 1), & e_d, size(e_d), s, rcond, rank, & work, lwork, iwork_allocated, errorcode) - deallocate(work, iwork_allocated) - + deallocate(work, iwork_allocated) + ! Copy in the solution solution = e_d @@ -309,154 +543,150 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) end if - ! In some cases with rank deficiency, we can still end up with non-zero (or negative) eigenvalues that + ! print *, "coefficients r", coefficients(:, 1) + ! print *, "coefficients c", coefficients(:, 2) + + ! These are the tolerances that control the clustering + H_norm = norm2(H_n(1:m,1:m)) + rel_tol = 1.0d0 * sqrt(epsilon(1.0d0)) + abs_tol = epsilon(1.0d0) * max(H_norm, beta) + + !print *, "H_norm", H_norm, "rel_tol", rel_tol, "abs_tol", abs_tol + + ! In some cases with numerical rank deficiency, we can still + ! end up with non-zero (or negative) eigenvalues that ! are trivially small - we set them explicitly to zero do i_loc = 1, poly_order + 1 - if (abs(coefficients(i_loc, 1)**2 + coefficients(i_loc, 2)**2) < 1e-12) then + if (coefficients(i_loc,1)**2 + coefficients(i_loc,2)**2 < & + (abs_tol + rel_tol*H_norm)**2) then coefficients(i_loc, 1) = 0d0 coefficients(i_loc, 2) = 0d0 end if end do - ! ~~~~~~~~~~~~~~ - ! Add roots for stability - ! ~~~~~~~~~~~~~~ - if (add_roots) then - - ! Compute the product of factors - pof = 1 - extra_pair_roots = 0 - overflow = 0 - total_extra = 0 - do k_loc = 1, poly_order + 1 + ! print *, "after zero coefficients r", coefficients(:, 1) + ! print *, "after zero coefficients c", coefficients(:, 2) + + ! 
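+    ! For scale (in double precision): epsilon ~ 2.2e-16, so rel_tol ~ 1.5e-8 and
+    ! abs_tol ~ 2.2e-16 * max(||H||, beta). Roots that agree to roughly eight significant
+    ! figures (relative to the size of H) are therefore treated as a single root below.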
Cluster close eigenvalues together to improve stability of the polynomial evaluation + call cluster_eigenvalues_stable(coefficients(:, 1), coefficients(:, 2), rel_tol, abs_tol) + + ! print *, "after cluster coefficients r", coefficients(:, 1) + ! print *, "after cluster coefficients c", coefficients(:, 2) - a = coefficients(k_loc, 1) - b = coefficients(k_loc, 2) - - ! We have already computed pof for the positive imaginary complex conjugate - if (b < 0) cycle + ! ~~~~~~~~~~~~~~ + ! Extract the non-zero eigenvalues for root adding and leja ordering + ! Zero eigenvalues will be appended at the end + ! ~~~~~~~~~~~~~~ + ! Count the number of non-zero eigenvalues after clustering + numerical_order = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then + numerical_order = numerical_order + 1 + end if + end do - ! Skips eigenvalues that are numerically zero - if (abs(a) < 1e-12) cycle - if (a**2 + b**2 < 1e-12) cycle + ! ~~~~~~~~~~~~~~ + ! Add roots for stability (only on non-zero eigenvalues) + ! ~~~~~~~~~~~~~~ + if (add_roots .AND. numerical_order > 0) then - ! Compute product(k)_{i, j/=i} * | 1 - theta_j/theta_i| - do i_loc = 1, poly_order + 1 + ! Extract non-zero eigenvalues into a temporary array + allocate(coefficients_temp(numerical_order, 2)) + counter = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then + counter = counter + 1 + coefficients_temp(counter, 1) = coefficients(i_loc, 1) + coefficients_temp(counter, 2) = coefficients(i_loc, 2) + end if + end do - ! Skip - if (k_loc == i_loc) cycle + ! Call compute_extra_roots only on the non-zero eigenvalues + ! This allocates real_roots_added/imag_roots_added with the original + extra roots + call compute_extra_roots(coefficients_temp(:, 1), coefficients_temp(:, 2), & + real_roots_added, imag_roots_added) + + ! total number of non-zero roots after adding extras + total_extra = size(real_roots_added) - numerical_order + + ! Resize coefficients to hold non-zero roots (with extras) + zero roots at end + deallocate(coefficients) + allocate(coefficients(size(real_roots_added) + (poly_order + 1 - numerical_order), 2)) + coefficients = 0d0 + + ! Create perturbed copy for leja ordering + allocate(perturbed_real(size(real_roots_added))) + allocate(perturbed_imag(size(real_roots_added))) + perturbed_real = real_roots_added + perturbed_imag = imag_roots_added + + ! Perturb the extra roots so they have unique values for the leja sort + counter = numerical_order + 1 + do i_loc = 1, numerical_order + k_loc = 0 + do j_loc = counter, size(real_roots_added) + ! Check if this extra root matches the original + if (real_roots_added(j_loc) == coefficients_temp(i_loc, 1) .AND. & + abs(imag_roots_added(j_loc)) == abs(coefficients_temp(i_loc, 2))) then + k_loc = k_loc + 1 + perturbed_real(j_loc) = real_roots_added(j_loc) + k_loc * 5e-8 + end if + end do + end do - c = coefficients(i_loc, 1) - d = coefficients(i_loc, 2) + ! Leja order only the non-zero eigenvalues (with extras) + call modified_leja(perturbed_real, perturbed_imag, indices) - ! Skips eigenvalues that are numerically zero - if (abs(c) < 1e-12) cycle - if (c**2 + d**2 < 1e-12) cycle + ! Reorder the (non-perturbed) roots using the leja ordering + coefficients(1:size(real_roots_added), 1) = real_roots_added(indices) + coefficients(1:size(real_roots_added), 2) = imag_roots_added(indices) - ! 
theta_k/theta_i - div_real = (a * c + b * d)/(c**2 + d**2) - div_imag = (b * c - a * d)/(c**2 + d**2) + ! Zero eigenvalues are already zero at the end from the coefficients = 0d0 above - ! |1 - theta_k/theta_i| - div_mag = sqrt((1 - div_real)**2 + div_imag**2) + ! Cleanup + deallocate(coefficients_temp, real_roots_added, imag_roots_added) + deallocate(perturbed_real, perturbed_imag, indices) - ! Pof is about to overflow, store the exponent and - ! reset pof back to one - ! We can hit this for very high order polynomials, where we have to - ! add more roots than 22 (ie pof > 1e308) - if (log10(pof(k_loc)) + log10(div_mag) > 307) then - overflow(k_loc) = overflow(k_loc) + int(log10(pof(k_loc))) - pof(k_loc) = 1 - end if + else - ! Product - pof(k_loc) = pof(k_loc) * div_mag + ! No root adding - just leja order the non-zero eigenvalues + ! and put zeros at the end + if (numerical_order > 0) then + ! Extract non-zero eigenvalues + allocate(coefficients_temp(numerical_order, 2)) + counter = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then + counter = counter + 1 + coefficients_temp(counter, 1) = coefficients(i_loc, 1) + coefficients_temp(counter, 2) = coefficients(i_loc, 2) + end if end do - ! If pof > 1e4, we add an extra root, plus one extra for every 1e14 - if (log10(pof(k_loc)) > 4 .OR. overflow(k_loc) /= 0) then - - ! if real extra_pair_roots counts each distinct real root we're adding - ! if imaginary it only counts a pair as one - extra_pair_roots(k_loc) = ceiling((log10(pof(k_loc)) + overflow(k_loc) - 4.0)/14.0) - total_extra = total_extra + extra_pair_roots(k_loc) + ! Leja order the non-zero eigenvalues + call modified_leja(coefficients_temp(:, 1), coefficients_temp(:, 2), indices) - ! If imaginary, the pof is the same for the conjugate, let's just set it to -1 - if (b > 0) then - ! We know the positive imaginary value is first, so the conjugate follows it - pof(k_loc+1) = -1 - ! We need the conjugates as well - total_extra = total_extra + extra_pair_roots(k_loc) + ! Reorder and put zeros at the end + coefficients = 0d0 + coefficients(1:numerical_order, 1) = coefficients_temp(indices, 1) + coefficients(1:numerical_order, 2) = coefficients_temp(indices, 2) - end if - end if - end do + deallocate(coefficients_temp, indices) - ! If we have extra roots we need to resize the coefficients storage - if (total_extra > 0) then - allocate(coefficients_temp(size(coefficients, 1), size(coefficients, 2))) - coefficients_temp(1:size(coefficients, 1), 1:size(coefficients, 2)) = coefficients - deallocate(coefficients) - allocate(coefficients(size(coefficients_temp, 1) + total_extra, 2)) - coefficients = 0 - coefficients(1:size(coefficients_temp, 1), :) = coefficients_temp - deallocate(coefficients_temp) end if - end if - ! Take a copy of the existing roots - coefficients_temp = coefficients - - if (add_roots) then - - ! Add the extra copies of roots, ensuring conjugate pairs we add - ! are next to each other - counter = size(extra_pair_roots)+1 - do i_loc = 1, size(extra_pair_roots) - - ! For each extra root pair to add - do j_loc = 1, extra_pair_roots(i_loc) - - coefficients(counter, :) = coefficients(i_loc, :) - ! Add in the conjugate - if (coefficients(i_loc, 2) > 0) then - coefficients(counter+1, 1) = coefficients(i_loc, 1) - coefficients(counter+1, 2) = -coefficients(i_loc, 2) - end if - - ! Store a perturbed root so we have unique values for the leja sort below - ! 
Just peturbing the real value - coefficients_temp(counter, 1) = coefficients(i_loc, 1) + j_loc * 5e-8 - coefficients_temp(counter, 2) = coefficients(i_loc, 2) - ! Add in the conjugate - if (coefficients(i_loc, 2) > 0) then - coefficients_temp(counter+1, 1) = coefficients(i_loc, 1) + j_loc * 5e-8 - coefficients_temp(counter+1, 2) = -coefficients(i_loc, 2) - end if - - counter = counter + 1 - if (coefficients(i_loc, 2) > 0) counter = counter + 1 - end do - end do end if - ! ~~~~~~~~~~~~~~ - ! Now compute a modified leja ordering for stability - ! ~~~~~~~~~~~~~~ - ! Called with the peturbed extra roots - call modified_leja(coefficients_temp(:,1), coefficients_temp(:,2), indices) - - ! Reorder the (non-peturbed) roots - coefficients(:,1) = coefficients(indices,1) - coefficients(:,2) = coefficients(indices,2) + ! print *, "after root adding and leja coefficients r", coefficients(:, 1) + ! print *, "after root adding and leja coefficients c", coefficients(:, 2) ! Cleanup - deallocate(coefficients_temp) do i_loc = 1, subspace_size+1 call VecDestroy(V_n(i_loc), ierr) end do call VecDestroy(w_j, ierr) + end subroutine calculate_gmres_polynomial_roots_newton @@ -1209,6 +1439,11 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If real if (coefficients(term,2) == 0d0) then + if (abs(coefficients(term,1)) < 1e-12) then + term = term + 1 + cycle + end if + !print *, "REAL CASE assembly", term ! ~~~~~~~~~~~ @@ -1218,9 +1453,8 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Also we skip the first one if we're real as that value has already been added to the ! matrix by the build_gmres_polynomial_newton_inverse_full (as we had to build the product up ! to that order) - ! ~~~~~~~~~~~ - if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12 .AND. & - status_output(term, 1) /= 1) then + ! ~~~~~~~~~~~ + if (ncols /= 0 .AND. status_output(term, 1) /= 1) then !print *, "ADDING IN REAL TERM ", term call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & @@ -1250,6 +1484,11 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If complex else + if (coefficients(term,1)**2 + coefficients(term,2)**2 < 1e-12) then + term = term + 2 + cycle + end if + !print *, "COMPLEX CASE assembly", term square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) @@ -1277,7 +1516,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp end do ! This is the p = p + 1/(a^2 + b^2) * temp - if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then + if (ncols /= 0) then call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & square_sum * temp(1:ncols), ADD_VALUES, ierr) end if @@ -1378,8 +1617,6 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) end if end if - - ! 
Delete our symbolic do j_loc = 1, ncols if (associated(symbolic_ones(j_loc)%ptr)) then deallocate(symbolic_ones(j_loc)%ptr) diff --git a/tests/Makefile b/tests/Makefile index 47a11c3..797edf9 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -134,7 +134,7 @@ run_tests_load_serial: echo "--- Testing order = $$order ---"; \ ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ done - @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity in parallel" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity" @for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ for sparsity in $$(seq 1 $$(($$order - 1))); do \ @@ -189,12 +189,15 @@ run_tests_load_parallel: @for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ - done - @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders first order fixed sparsity in parallel" + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity in parallel" @for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ - $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order; \ - done + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + done; \ + done # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -416,7 +419,24 @@ run_tests_no_load_serial: -pc_air_improve_w_its 3 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power @echo "Test improving Z with PC regenerated with no sparsity change with 1 iteration" ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -pc_air_one_point_classical_prolong 0 \ - -pc_air_improve_w_its 1 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power + -pc_air_improve_w_its 1 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo "Test Newton AIRG on advection for for different orders" + @for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done + @echo "Test Newton AIRG on advection for for different orders and fixed sparsity" + @for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done; \ + done # # ~~~~~~~~~~~~~~~~~~~~~~~ # Include kokkos examples @@ -614,6 +634,23 @@ run_tests_no_load_parallel: @echo "Test improving W with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -pc_air_one_point_classical_prolong 0 \ -pc_air_improve_w_its 3 -ksp_max_it 3 -pc_air_a_drop 1e-3 
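+      ! Note on the shape chosen above: for the Newton basis, column 1 of coefficients holds
+      ! the real parts and column 2 the imaginary parts of the roots, and
+      ! calculate_gmres_polynomial_roots_newton may reallocate the array to a larger size when
+      ! it appends extra copies of roots, hence the heap allocation above rather than pointing
+      ! at the fixed-size stack array.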
-pc_air_inverse_type power +# + @echo "" + @echo "Test Newton AIRG on advection for for different orders in parallel" + @for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done + @echo "Test Newton AIRG on advection for for different orders and fixed sparsity in parallel" + @for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done; \ + done # # ~~~~~~~~~~~~~~~~~~~~~~~ From 36750055281c171b56c52ced1224e91ce6d4e19a Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 22:13:50 +0000 Subject: [PATCH 21/41] Fix bug in Newton where too little memorywas being allocated for the coefficients --- src/Approx_Inverse_Setup.F90 | 7 ++++++- tests/Makefile | 12 +++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/Approx_Inverse_Setup.F90 b/src/Approx_Inverse_Setup.F90 index f83f6e4..293f4e0 100644 --- a/src/Approx_Inverse_Setup.F90 +++ b/src/Approx_Inverse_Setup.F90 @@ -70,7 +70,12 @@ subroutine calculate_and_build_approximate_inverse(matrix, inverse_type, & allocate(coefficients(poly_order + 1, 1)) end if else - coefficients => coefficients_stack + if (inverse_type == PFLAREINV_NEWTON .OR. inverse_type == PFLAREINV_NEWTON_NO_EXTRA) then + ! Newton basis needs storage for real and imaginary roots + allocate(coefficients(poly_order + 1, 2)) + else + coefficients => coefficients_stack + end if end if ! 
This is diabolical - In petsc 3.22, they changed the way to test for diff --git a/tests/Makefile b/tests/Makefile index 797edf9..33391a6 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -93,6 +93,8 @@ run_tests_load_serial: ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -ksp_max_it 21 @echo "Test single level GMRES polynomial preconditioning for hyperbolic streaming problem in C" ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type power -ksp_max_it 21 + @echo "Test single level Newton GMRES polynomial preconditioning for hyperbolic streaming problem in C" + ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type newton -ksp_max_it 21 # @echo "" @echo "Test single level GMRES polynomial preconditioning with the Newton basis matrix-free for hyperbolic streaming problem in C" @@ -320,8 +322,8 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 -# @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" -# ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 # @echo "" @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change with 0th order fixed sparsity" @@ -593,9 +595,9 @@ run_tests_no_load_parallel: @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -# @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" -# $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ -# -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change in parallel with 0th order fixed sparsity" From 88d2c3997f7907978e40773c8ed8dbdae3478d14 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 22:16:26 +0000 Subject: [PATCH 22/41] Add newton test for indefinite problem --- tests/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/Makefile b/tests/Makefile index 33391a6..534f1b4 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -123,6 +123,8 @@ run_tests_load_serial: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals" ./ex6 -f data/e05r0100_petsc -b_in_f 0 
-pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 + @echo "Test AIRG with Newton GMRES polynomials in indefinite problem with zero diagonals" + ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 26 # @echo "" @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order" From 54e02ff088d59542af94c8ac1dcfb628bd690416 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 22:36:48 +0000 Subject: [PATCH 23/41] Added 1138_bus test from matrix market and Loe's paper for Newton GMRES polynomial, both matrix-free and assembled --- tests/Makefile | 6 ++++++ tests/data/1138_bus | Bin 0 -> 35720 bytes tests/data/1138_bus.info | 1 + 3 files changed, 7 insertions(+) create mode 100644 tests/data/1138_bus create mode 100644 tests/data/1138_bus.info diff --git a/tests/Makefile b/tests/Makefile index 534f1b4..355816e 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -146,6 +146,12 @@ run_tests_load_serial: ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ done; \ done +# + @echo "" + @echo "Test Newton GMRES polynomials matrix-free with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -ksp_norm_type unpreconditioned -ksp_max_it 6 # ~~~~~~~~~~~ # ~~~~~~~~~~~ diff --git a/tests/data/1138_bus b/tests/data/1138_bus new file mode 100644 index 0000000000000000000000000000000000000000..dc7455355f0809361de6af65a2a559a98a7ee368 GIT binary patch literal 35720 zcmb`vby!wg*Y-_WU{ES1D4}2>f{hG96f8ioMKQ6vySux)-Nv@vZW)T*-5uM`ZRdB6 zwdTB@>-D~$_xt|%JdU5|>^aw5^IB`hSlGDskF&9H=w{;RV`F1uuSgFS>_9q&161NA zOx(6sen~5SYb1^2wGI=vF4H<(mn(6)4dT{yX_Yu#hK|>&^I4}6w{ELW_uo{flQ_AS z-~AG<%awR3Q_$hsufud5k|&EwC+!x$#7V!1W>s$S>%98^>pC0@5lUyshC%M<-1W6Sz2{^>v(bNHtKRjb)F1x zOBy{^^|iSmOt(eiH9B7W*7r)6Cw1xitkX)o4wt+-PQtWX+MwGe;gU}KbsFv0YMti) z7G@nMd93qU`^By6)@ig#8P;jVuhagwIO{sK+gf#5|4ntd5~r_S_qR@OokqfR8zfxv z=zP|0omYotQu%aV3D<6G6}MoWUWZw$^*z+#GRCx9)cTqvPPbR5(ec)Pam)4VFj2`T z=x}{a)~eI#d^*m$PMu$;vCgCWOQ-q0eG%}YZbr5TZfC=x*sH7$`$l@5Vy2TtFFVkOo`Lu zL0^M}$$L-yx^Bs%<3)8@I$px8(ea{^M#|JkS-R~R;nry-Uc0U95Vv)lP9v(*NxW{0 zz7M+1x?UZx{gVH`l`CaRT2brrby|Hrx*d{6$LaJsueJVOpKgcFYaOrSbUVba%d)Ol zhyAWPzb;$*bscigwOgmrerpxClr3tFgjwg4`}lj9Zo75(?`26D5~lB&?py10;?{m^ z)#29qdmAOrx}Cbd|LwYTex1%bzb?nRK5^?j)~c_=I!xTw<%wU{W34)mbv-(*E)>9t#{q|t84tNs5iOqZwASeLESYt?buFL^W)W{vgS z!rCoq^fOxAx=dZagy}M^uUn_paXOvk|2<6me^*_GsLn5Gti!GISeLES>3H34QSG)a z$2v^Ht*=F%FFIbA_j~!);X19Tbvo;3pS~VZjimiOk1p3bjdeQf@^xD4FkR38*Es8R zXpZ!QT@X~lG5Ub*(GH=ly4xjT^$; zfwWE9E$-|f2gnIL{)=C=(YW3`h|df10l5}ye}4EqK?^JOQhwE5yTb6RzO*|8w>Pfq zFjTh(*JA-2YNQ>i`ue{azYKzBOPk*r&qEKr4Z=ARLqfl|gwB0V;q;Mw4!=m8UaSh-;3M5SG7q){ml!g9uK;JpYwMw~F6jh;p zfT)xo3#x${pcd!_B z%!n6vD%cOsATAy>0SQWLLz6%qP!|jYiOT&3`W1XfUPpl6f-QcBwgrI?@CCT%w$lHS zx1ORtGzD}6(pL=?jiBv;gm(wxZ>*3ueM4Cp0DfP#7~8hqK-Yu&V~cxXi+f;;ez%oA zmpo13mOPZMv9oc7K>|8%c*g zvfBcM+g|R2?z0|n4+Nq;fs7lWALtEaj0t^F<|k8)xc+cUngKxEqB5pLrMw}iufYGf zjkv+^3y3p*Pa~ik0Q$-J{S1dn*k~Z`$tdHI3L6O(MgfT*hqwpIJpnq#46}jT?latD z0o?W-fLuS)+l)7{uZS?*Ux~Mwi0iTi($ABC)cFs1f;8n!zrBn{+;15ZcK96_&#Gxq zVY*@lbS04A*#CY1;~=tDw@h{mPGVY^1#g)p4ugmUc_IQvM#W 
z7W@Sy-#Q@U=_-)6$u-YNG^sEoh!;0loX{sf!AW*~W`AH*#x{!2j0lXS;{u74Zc z(s$AyqS7Zfm0RkO>y$ndHv{R<{tfqGAoutHI0)n( zNxbxt#7Q|f00W7WuromVM6OfvN%}M(`7Z#S?g-q{)?-SgEOE=V3MYWRHff`jCw(L3 z-%%>XUi|T)Xi*)qN@Hq#Q}7(_e>MuHlhV=}X)@o4Z!}M2#^$rcl3w ztx6+(E#pdlf5JoX50J4ZZYfv$B@AWRJO$6uwjk&;({1w({wyFH@CN}ZO+M%g_#Hrg z5DYq^J#v3uf>-b(pH^EBxL+g84*CYX1s}k7gh6aG0auU%c!7ez8~6ZUfO>610D)*5 zGylKP_W*6Qfm_-rerczaE$MAOf=}S5m3~HCF8IGdA-3s(7pMr&SEwWO8&v8m3pd(m z^8=s^+q_DV(bgUyugx!L0f78Q`w(P2ucU9#R=H-|>_F0c0SQkBoPZlB2c!)dKxvQ> zNO%z-{et^pyo=Fh+dQBcki6($+oB*dkZ|$qKFkSsCUndn)%s%EJAVJ~QS5-cYGe(qbGK_umsL_sk1OU;KXm{17JNrw}Ntkb5d& zqPSMOolv7P;{0!W+XzDYk;29jU$;XWGWin{`+ z1SGwkqgHFob2r5XJAng;9Ku6FN zGy}829OZ5Y9SD+@4uUoZy+8{f{)wO^7z@UMR^T>h4X%MU=-=be6d>VuKwB^k;m4p- zM|;o#^Z-4Znr|l>R&*vd>Ui+AB0K1$4Z|-QHJePrMgUM zi@FBe7sxAp{Stfzui&?VzJ_LmzJX?fzJ)qM-+_P8Z$V(H>9&0j{QxBYS2LfTD>SDW zZkGgiGtd%w)*$SoN{4#v3d8*gNM0ay+GPRZ@M8=~dA8rccZ8u~Mm*YMSI`W%YX|or zl$+U1WBXI3%K*)eupdykC0yETzYy+UW|&;3oh`^?sy))%=Z0JAu?G$y9l&)+eLBqk zF4Emsap~dS3l76CbxQlCeFFN+pfci7k3IbM1rX*0+~K|op1_T^+m!)w-C2S39n^jf zSb_O&5fBJUf)AJMl;3EZ^f#U-_PaouN`t<( z$M4Gy?UUb^kw?@ENZ*KC;sqTh*WnHPfDe#s_5~q8!U}-^wKtF(inB( zUK!uhd%?f0#wFFMW~EfT#sEP5Cy`3 zj58S<1jTrevF0FS0?3#!%8G;=<4nr29|RU59AS2`0Od$MQnoRE@*@moTE>*zhbo{d z%9Z|t8o%w9&}kqB)B#dfB4`DwfdnAo)j>^A14vjK<*yH|1)766&T)i ztr92glRT1Fu34^6{Bj*BNGI`aK@Ttpv;*xGy`dOacAb&V2ig&I0YgA1AnAt!#M=!7 z64wuO1$~vf8&uMD2fcu#m%0XkVW20FaO5+_hq$F)$uIRtpXjps!wnqd{z`tSQ_7b9 zlC)B`lsO!X2Q$G4Fcx{dp(DX;xN}2CL7kza!AiI>_Uy)h#c=0`B9Gk!Fb*t4m^)O; zl(1P~E|?FNBQ75l;SM>W6Tu3&y`Yo86fg%&2B=5;M)_0WUIb8|F%FT(ZaU&}fdX*P z01_tGAZesdX}c#>%3BMh4Qs$Wgv&KadlrCIstjp&Gx((*gjt@M$m`$%q^>*w^%?1w zAk9*+4F2p&k;b@2SE#f{`U?H#P|$Q6<*tT52asz7`TIaBV6Yz? zP+WqhfrE-e(8J&YxCM@YW8kP#aUTZ~Cwc;$RGflJzSBz2C_St6oYM13CGABZWr<2# zE`uxJx|Lpq`VD)BeKO(6MigMYvs@Ce*h{(H)QANm+P01s7|_{IGKd{kjlhVT?f z{J-Fd@=Mq=@Em+l`W`Cz-+-6k6_Bz-UjvE%0wn&e@_&cE1K)s@FKGlRQ}TWSpA|o# zKY^6-RU!GL42k;%BpzyS1Mr`rw$ReRPK8Oh14svQ1B_pJ{}}%C&NyRb+!oUCzJ-xB*qZ5som2HVBh)B)`PF1Eg_i zZ@T3GjoQ~l!5 z1#U^_t&sFx;r0c=zz_5Sg}@L{7|aI#AOOr%S_C=)6a_uNdX$q4Ee47MiIcSAE&&3; zBBdpveTTl@Yhz6m`T?JYR zR0SH4BNgE5oK{X&{M1bm`35WzWfaKHlG=#emkUFGnDObvqa-yT8b2C z9FY4e_oyj|2X%qOC4hQB?ptk8A4r{bKzkthB(0>8I7uh@bXyw3El8WCd_nR{JnN6-RDSZg5R;%=qTVWMq-j2Ed_RQgKtw*%5f2@{nvB)pr_PEd)H`zd2k z+=9e+2WxDMd5m$dr@}8|rzaQ$76HT?<8m-`2#|3yAM^n;Kwr=gj8}ef_Xh*OFd$`! 
zdmxwtR--<-K9uhu*Q(2vvZnwka}yW}q&*9PlrtPi9ik%u(mF_Aj09uBC~zH&24lcF zAmxZ#;^zVh8wchADQhy2`lO8$z(mC)sKg1yZ;2}cjI2m*pRa1TBJXueY?R1rIK$WkiHW(0|r~bK_F#q1v|kWuowId(ty;z z9qa%S_m^@@zN6qYkbJ3N7mzek_dey8^t!ymaGwNHw!}*w@k?9w1Gx`Jl>3nKpMXj` zq&|t0IwkHnkn(icF}MW@mpp=$eF|I#(#|vBEVv1tDEB$#KM$337r;eu30wh>lw0Di zsj#ci>p;qpFme9_Zh*VsK9IOuirdgT3W>j`kaQ1$)bT*MwO_)d%*R05`V`~I4*Cr0 z2z?IC0DS??3VjJ)f%o7m_>S;&Dy@XS20y_Y@D_XlDxLigq|2tl-zoRMN~JvHb8rAs zUS^O5NSz;)euN_4{uB5GcP8j(6{hNO$cV6S0PQkBokn{ky$z6SQ+W-)J;H3waEWse zKd#$Q-Inz53+M-_)4>U#9>XnZ#V=U3S+3KVue(59K`!8?G%wU0XiJ^%3lF0?JNx%1BsV1 zByV{j?bT_dT=7eO0`Z_82nP|0C}?dU?H6|ps0^gODxeyO19C0muL%->lv5Q*yyS^h zh`+j`DYOQVw34qDXabUelqv101Ef4@gSZnx15g)8I>{?_OISnD$V%n1ubEkGyG60`!XK^u?)B)lzX2PA!a&;fJ=5+`9@Kv$aWl<&`CS?hAajEB}_Z_<== zyeN66SKhy=VC7H48|)2cr?7rC`|Yf236F16_u0{A^|-)d`wj~nCvsRJch_`#?KsdS zZ^!)36WFbvXSMwIBe_7Q$jWD4KBaiS-Ul21mB!(Ri1F(@heAWADs2V)rJ(?JY;&)wC(KXQEkbSh#sj=?iA@!(D`KQ+ki)V zN8c!tdhhGdE`K~pN0lNRI`oRY%URRqOs+h$bn2a}J@4Nr>cr)3e-*F#WHsgeb9vRy z4)arAe0`U7<@S=PADbMn=32v%a$KJH?efYQ7Jr2j(?ib3JWT!f+}3xF-9K@@4P7FW zz9zGG=WR`M{yiZ~g;U;XE+_04y-oeGwAZZ-ukLYn|7~t_T9hOICti6keObUtY2NVk zKDSEb;Yu40d@i$X8CM!`x97(n1-VkILXQg6YirRt^GjD=dVD{rbfnUeN@vj>-8?I_ zY?zOCB*gT&_~ShXg^h99a;*p#DLN`$O8!%{qfNS{$9XurpPw}CQp57>o*JB`#^J{7 zzI$ti!cU8_`>IjN8@uc$6;CRjR6OZ;mjPizJ!{3MJ~*}Lb>y4eR4}Z}x7TMgu-y-z zMNwbVvHfsQ`*W4ArvB?Re_eK$yPU4@hl=$Udy|SM6(6SJ!@k+Q*!Zl)&eVtYR}TBS z-bsCH^Xob#T(*R3|D$Xl9Iy4hoT`+n7mgYbcp(G(RJJeqVD$|SXs|Oi|DhmzYEe!bgZF@q4Jozw>twG#sr2NGsdr}da~zW^j1p42)OsJBn@cosd*UAV zhO79#EIi?{1LtV#P;5d#WeRL}Gt-biBT^q9rxVKpnp3C0?->2vbY)bVviSyawZ-R; z?3!|!`4j~I1eqA(cOi90|eYr`R)7{(W$8sBIb38XszPEYol0u}@h1RaKc*e*iR!WU? z+HczV)R~o1iRvS^-#i?~N~vre@w&58YUsGWbiZs#p7L4WGMBvbp{69#ep%CnbMopF%MR-Rq6)xM=nwHKw&_OKtT>*R{lVk(Y(??^3Q zrvEr2x`*X=U3|&jeaUBjQ8)L^@B07wi>sVC_Ugp9985)Ab8p*Olw-X7V^+2rLT#?P z_q$hRBe!~$N&1~S1cu3jmfgm6}LX>S)P=i`ef}={@b9b z9OWNdqezjTT(0ZSQNt@XuIDQf@PIukG?qC1~WDOF_v4^707( z*m|o+9bnf^5AS!ZYR4HpHY_f2V+u9zuy|juquaPrhRseV{>jG?#mDb&d!T})JzH00 z*oZ6St?KvKrQNRh=OaD2?w;bK&mY;%4QOS*J{=0u(B(~~1G#FZtHbvMM_IJ@%)RXf zeYrs+#wHr=h&xiFU5@;ei=znycY79ka@WRPZ@2v; zl9QEIHQUS8&2eMVk*c1~=J=*d&iI>ZcYEX?M3zy#YWlKq}kp6iW zKf7ZT7gD(ShdGOw!Ybhh<4hw8ciZfBZ~?dMaV^K_B6b}25KoiLAzZdpu@C9ae&t>X zG4aFJPNy!#E1U{hQ=jXd3C(b{S`rN}&~L-FCzaUuO#24|pO)ul-9|NtegBsI84*&%XW~X8v9*?a}QbHyH+fIlTSLuaQ5Sa9MRiekEf9>IpuP{9F@8U-5uHy&0jcePP*H_zTL5;OGVeB>ag;U+0 z_1U}LipT4Fw&c9iVsd|XIL7qu+n_1GW>ebq?#A;q??+5u>JDSyokinkFN?7}2MUMY z>g2!;IP0dv&TpP(rs}iy$$+IyuG1iugv$}2Sr?NxyQ_Bb1~L~eG;LDmg$ud2`Mx3LC*5yKY4WI{GGX8eDSP~n^j%E{ z!%H+S{2|S9J%>8CGM;x)FDiU0N^C4sB+o)M4CAkAS%)Iox^f#%Z+*l74?FrLGQ(UZiem#PPdwH@jH3IL8cIaAIJ1 z4{DM9sl0eNs#1wRI&2T&sI}oo3v@`PM&Iz79rTqW*5vQo=J+$J-~G+Y(2)z7>e=pU z{jh@N`PggQwDSH(`f;852Tn)FOyT&y%y`V&=6&%rQ?qkN)f3mn)mUbDEH@h#j6_oNzM3)`T@t zJSNlCKh`g<%Mm$7j-GyN4mF^4Ef0Jy%Pz})?D}#4HHWl4x$|PqD4MV%v|?$OVD|Sa zy!6g)%rBLu&Gs;!tNMBF`m=hxOS=h{`%(Y>%_c3zwC2d%rCz_CX-|!8zdi0SK1#p; zxI(KxFJ82(#$NSoXI*{&h1yp%p1YQLOpP}iGUA2L%?f0kXV=r0>W&DiKD>22`)~W2 z&b`7XicY(<=TDCu9JJJ?(lf95T-2kHN1bsCsNTxCm3O4|qKJm}2RAR8%OMj(M=qE! 
zor~9)kzs6dC|wx8(s=LsoBhl-yDwf1&XiZTTPvka>G6O3p3HV5age88)|HpGkR7#)kAZOo4><5azeEc0P=UQ7we_pbMMD#uDGV9v01Yo155 zQt~;2+t+73E2VJt{!mIqI>&FA_cK2$)$={wf7CpmyaubXQi`nLwy)r$+pLr-;_ZKK zXHHg1-c>h#3)tw-uFoG`dzg6yyRGb#w`;A_6w(xLVZWj5lg=x{ttdD4+fyRl>Wr_* z_jK2Bqpo;b(tB+F{y9rp8A`-dyi&2|T*Yxw9^5%D*NP;JIgJ?uM_y^tXK0@MdYz!x6V#-G@qFN#HLxt( z_K{20`FoFJLReB7C<0H%bsAnSH%3h5bD`fe{#nik}&wIjEK9!&1QyRCq^B{l= z9vSw*wRJGp%zx$ZxAQ%y-11Jvw;U|RMMFA&cWLsGD#bLahc_OVKGHd$Ve~@IpJCIp z(2qAM*2{eVMb4kU=Vqp%9N^aIlIx2O6t`nqU(EEma`fp}*(z>hw?TBYO3|!b@KVkd z$=!ER+ru(V_Sf$-ii!8GxqE4E_8%tG?s$%PgXKs17J$iitgcvuraM>Y^p^m4(7ZiYFkM5@xz92@cOPjCl#1QUIj9S z?)5IoN~!6D+vhP7IO@>jsdlLOB9WR;D5a?%&2@!u=MDM%Z+7EigBQz#n5Q=Gy7t@S zjvSEJ^^c){Jg2dTywlb8^WYJe5{>mzt(8%3iB%eMv7|;3l|O8xPIt@IcWz$OGAG5vdR`C`+Y%Qj9SHSeZ~O2el_ zTutMUzsoFMJIIZu9XKNkKpGp^VSMDwp;Xfe%b9Fnxbo{ShXe97=D;$>yop-Pl5LQZ zdYn`6>v5Ri+p_E3-Y2=tbP612?4OXDS5w8^Pbx(;e8qW_+7@z3e~^M6{(7Dvwm18< zx-hr%ni4dAY>5|jFP-H?k1KeAjne%{NvXzrmFje?ob|`CL$ttct1A*GO7zJ7KFO&j2R~o=p!TOTRJl&4{e$+rrLN1({TTiHi|aAV?LwaUmUWdX zkD8Bg>SG^$lPly}U#;7rTr@vax*ru}=s^6%7%gWv#xq@1+>)*5QtBmrXB^6G(yc?P5XQ<;BR0$$ah=Y&W6r$JXBZ zy{VPzPLl1QVD>887YmLSlyocHs}bks^b;aKe92+UL0>kNcRziQWoAz!; z`(*N5ztMi{)77k$R!zxd{EpkG=S$rLV_nNhw`97Jj_dip_&LA-JBvTy9)6YO*KpXr zZ|A#T`pdHZQ~Moy-@~%*(d(ap$-OtVOkbV@Pme6MB|{Ic>$$UigZ9%(%=Nxr zAL;eE>PKp_7_Yw}3%TZo11sv>9!&MVXK32_ycd`0+u~-`b{D8|jQPAQbL&_>_XmH| zu+sk1o$lV@E)Q$_<=nrIyQ}yA;Erbhc67P%e)RXSk{~XF`THeYq;RFgJUc@HcvpQVw8xmpGq#f5$GwfRWtJ z(VXAu_uqtWS@Ph?%VFK*HhEaqld*fB&&%2I2BjSzA_ItH&F7Pz?{FP+9Z%}MQI{su zZ_WN#gqx+GQtWP)JeK(2>sw@_i~aA6mkls#a|hG?&r9|7P_t(H>l_>ThFU+sbP4-W z>N&`@NUF8r#1ebP_^q2v%|<wW)m|1PcmB7(h!KlS3aPxL+$hZkSs+F;RD4s3G0P~qkuNv&Hce)*q24|Qxp zGb%O-9PfQ{gI}%9pO|;4!@?O_^u(2W`Fm9?SD+PBh;O zdOv_FOzP$JYN8A4=RuTBpNPQkpGoh3>EGXinnwN9?jn}y^H*_#npbqawPMZwI>)I? z>x4SZ*0m!2JW#)XYI+;Xz*XBV>yT|Tyre;-#sy6rzH=L%oHVhGv2LV})m#?b?fJ=~ zdVQ`G>kMQ6#j+36Wd4#bncDB+lIFUd64n~?RH}T$)TFtXc4I$Bt&gZh*`wR=rq$1T z8uPs|w(nd??-#0mrJ2)eV!Hhh9@lZ!L4Uxw95?avhR4%@?{=5U4U&+|qWT}e%*8~eDF z*yNwQc=J=MP-DN4^txs76j??WA^YezXYSRBWTjy*N-lOUdTUdv(y*`V);stX-k7SC z9PYRdP8#u+m4N%03PoH4VAAd5%G)O5rY4&1v!1xONi@?-TmH z#>I{=^lX-M4h7%a(HSQztbV@~w-Md{{1xZeI$bte*tf%92X@z(!OpibI<=`-%kmyB z9vi)|d&Da)_-24t=4^IUG55Gb`@_z0$(BVcE_%UKetD6^*Pgezbo-}xA+)CkOU?B` z(IQcgGAC{2;BCSC*KBBPDX&_L%|Dw?-q-sERQqtjzh11{ZJ8(5c{IokCF=cOs#x2Y z?@_+Ni=!@9Zo^8c*o6wty`#r)_QI9jN{s%4GxTlg*)pas=iWbfMAX7mDw%oIAH`0% zbJj5@VqZ*b%$|q348zV9Rs1?G-?!=G*yCjTsx^A8*XtOvjW^~stdxobV%uWDM9!XU z*SDVizwBkso5S8e+x=(grJAYtLtodtaxo?K@$3=VmtQVLRm}GxE2W@`Z}+`l@PU<* zS8j9sDW$LKt~M8Z9Dlm-%ceY7jsMcUjpq>ixs@q!sp4(+yU}sfi<~d0en8T# zbp9b+uzJHszn1U$Up2h{DA}lCw{3~5&xa3X-w}0-XZkssbF}I|;>LpRr1D!d>NR$; z$XU-hu@VO_As6wY;*g1YuDjZNag!o!&Ls? zm5Nq<`KVgeiTuh(@07nt(}JPf zjeP0uOjq?jq_2}oOo+nXg-aJ9$JC0^AxBcVe0{-Gcwaont+PR<;*d+)4|M*f(8x6!s<93++^+}mHC{!Dof=Sx?Ij&c+c?nFIKbDk$D?C zOzuey>@MO#^pe!?pM1I-zZ*)sw9nW-RnK#-lQ}*zwOdzCxPZs+>3MYc)iYz>RkgeM z{EE-{YFVarE4b=ibDyu#OYGA3t;7YdEqna&eJVSqAHVp<`LC%DH+ONYlj9ou7sd8? 
zrDs(3;Jrh`5;s%#H|G4dLyCFsrPlxdYk03E=5t=}6SO}fucdzUzlBFG%_zy&feMPAZ&+ecEKKZ-=P)fu7fPt830@+o|=t z4xdo#W07``FX{a{%RYVcH|BeN$bQ+5?#csFaw;Msr`KF?kU@ueYu>x zghn6xHN`#S8&z@l7Ut%HL<-mRW)nlsf4c>m^V)LD+}X1wpI?_KkJTmO!=>gRx- z-~H}ixpk8Kz9{~Jv2V#`*PG|Ze(6Rw(o|#Em~+5yVoB-yHC9U zE_din=eAL8NTsK=aZWf+OJL{Zs(4Vnr1x{gZBrR|ic(P7z6>cMLLO=+8w3{h1+k{ z)$c+5{-mg*M=Jdbmy&R!YA=SG1jp zQqt>Cz1~#z4tm~0!S0i<{V4sJl~UrR@_T1=X~mIhU!u&n98Jp2`o>C0g;R&Y#(5Oe z+yeF^4-ev?SxEMEIM;A7=jHkwOz($RZY#&O4Zde`v*q$FCRuS&Wy>#W7jrEDAk#FJs!jcR5e~rakO$-C*BKBi4)mg)Xh){7d$h4RHNJ zBZvCPdY}6_8ukV3vvD^p$o}9UWk;djcU)uUM>!y%aX*q|AD$|V-i`_VLTy*Z&ToC) zwx_lum5=I0951#b&t?v6zhK9R@y|KlJcp!}6^-XQ4T*%+M!EWy@ztfux9r=yKj*@- zJL9;DI=wN^3)Om@v!rAQ8Pp~#7b^9K$N8jN>|B3m(yBotsQuJrH>$bPVs}y5v}56t z?=$0v(w_9bng0F8zc$aW^!rbZKWe@c+Y}#vuxpw0PJjNjj&lsaV|w094qs;O<7#`k za+kmND>ux}vaj6J!Q3y@?W~>?r^^+OaiZEM*82%v8ugcrANFo{XM6fv?YQ8Gzvm2S zQJAw&DO2lntE>NGf1vgQ+m0~Lb7F6tl?j+Wr==l-jQ1(4_YOBsG0%(Ys`qxZI^WRe z5&myXJ4R}`tN+b6fm~6I$0Eb9eK}|q2lfBj@%51voXb2%Wk2&AiuL;`dG%3+Rvj2k#elxfExCLnIY^0 z>0!d-VoSX_uifp)mow+FU;Xu&$E+C4J}@$oefiSne(?W{ulogNVP{jfi;Db!naqxj ztdzQBGxyc>b3xzlx+5_=s<#IA5fe_HJb6mD3-)_C<-I})u75toNjJ4Qa6eb8A{VJ{ zxN+V`YQ1b(Hy3c8o7x~cjg?aPx=#;MS_38go+^>e$+O@Og^5(uPyZhukUcl=<_55m%hk9!Mqvyvp znsi+}c)}VEjF>BiOI$$hJ8FA}d`EX}EP2I^nqR~=W3JO2SK(@jD@k7LIez=yPv5_D zwAmkeok!(-cQ>E#w<`xuYMX9g&f(PHg0Vl%uB}T4B=u>;h0S%6<@r@qoxc^?RT~=+ zRoKfjDg5FNjH{|{wk=0wrG$R69L~xG-e-QAchUw5&RnKhYLmX4{T|z9?%a>sx`#bF zmM^>Br)E7bPyW|@K6*W05x0T&QP`D&1MSA;XQ!MAE>S1$Q{%?wev7-yw_ykV9?VK9 z$;sGXW#^ZDV>|86L;jT~cP+5K8nqm{|L>J93psc7w%b>Q&9l7Ee}AtPYuh$=&r_LM zDXIMi%f3UgoxATg`&pHhQjnUrDMY@arPH^r8lRheu0JXr zm*0;jWZH#kNmYyeMS(lR`^S2hx9p2Y|2VO?$c$@TY-#bsd!N*%#C$joJg|Z@BwQUl zrdMOu?<3FlKenCp&A>rrWI0!qoy_+n1*V#ISw+^)d+PJ9F!kF5v&^^a*gLKd4%qUt zYnsjcz;2T{pSrG`1D?H$-#>%e6*cz>L%+dnV$Lmne$Dy!-LA83Vg;OUkIa<1poL|< zkUJ@$;-=f%!k2iC&n@%1zd>Z-Z8OBPf&}7r@!DX?1wXAunTlT|B>@w^> z*n8^HXQ|(BQ2y}CXGUgu%Sy@Dd>^w?s&)0&<-66>vr;Pfw%qFd{l-#hSD5$J`^Ih! 
z9kbQwcZ+jF!Ytm$Oppxy-OfWmofj z-?G17=DC>v`Lg@>VL1+G_h#pvzG=4~4lqqI%D}e21CQ_{miG z)rS{Dj#OaZ`e9T5iGG{i(sGBQloQZs>fbAT}oUi+MF6~Htk~d zJh7zLsiQq>mEm0HclJK}_5&5LD|71P>MWdN*vQuDUe)6a_7~gxrH&?TpQ@B9PmCLF z>kPZx1p_aJhty&3w3xg3?zg3F=`wcP>~%<=%W&hp#{8Tr+#ZSrj|=DAJ^s_1HtE^L zw5#BfvyJZ?C~`WEC(B(X-=-7RUTNC$&yBToD_ZrYn#%36tvm2 z?|09z`Qsm5us`QsBdTs!Kh79_9uwe$r0rhybvAGcf9HCoJL&P2x&Mc(!M*&cS8KaT4r%jPore3K$-nD(|MGZnsj?Aiq`H5ISTEOSZigIo5)>&`XTQ)>N0>sqIX z;W*Va%o_@GsG6tu4DXux&za9y&v%CR$MMF?udLS>*}pa&as1K%R!Z&Ju)napx8l~V z+qLp^BTGNUR}-`P1g@6eJTGeFVeWUt&KU*+)^;?!skz^!?bMsJO)#0(vJr_5O!)}G!TB<&JM6Y)%S5VIwbHA;JIu8qSSn#3KhEdcjr);kjc95nurHB2>_)z*l&UoB=fM{(+GlE!sx<6-X!f@6r!-4d zN+3CR?yHjWQ~W9O9Mffa_`Pa(&a>B&^qZpBW#p{O_g-i?a*TeDacOg2 zPu^P=^groeF7@Z9^1n9Md_rmWdKz|%ZXb@E8auuOw^aS*UlpsPbg;KwfK&AUv4^$T zv+DhUh>UWa+0GK)W)N-{?6r0K5?4K4h9@-XbJjB5&2@tBVq+dgS}PPZ&yPLtG=3M8 z-&XH`Q%vg zlJAWrEB$89Vx`nzGOQs&Gjch1^Sxi@U!&iNros%p$w}&6Sf(w#*fk)&@zY0`f3KC` zmQ*YQzH&(K%u4@%_qUiU<{&<7muo<13N>Pno29mw3yP~~tFe*C#Ez;WhC4yot9`$S3%sqp{tkN1V4Vp8f|_v3|ZJRP~Tqw)O!m0r3?3>vw# zmvNv#?RFUBfwY}@Am0A?fPkW7FgqQ5lCu1A%yWO+ELKYHl}hX@wc#==C7(-)hl+Q* z&Pu5q;pq_(%}Ocvx^??A#TI3yl$JKd7@wgoH@h_s#d)flAL;$|xH)eccA0dWD=GWL zyeC6Wd-%1X`Z2p?VZjl$0atV1UQ2N?SY_4R!_GyVQsW*}p(OuUY=BnQ`$pJ*j5PM) zT|JL>c>dQ)D&Q-hZ}g%%M{o-3_Jd29=T8`x=y z%P&v}7&lhw%@|LSsHuH|7Y&Y+C>0Gmik|qEC3^PCW zdd3+mWgC1Z?K5XC61DWrkU&y?N*jxA#&=-)`QmzHc#b8BJy|Jvb($t0I8fC03Ddo< z&ESH`_s-{>+?G8m&z?GTp%3Snzw_%Q0%|l zDcA07>=19-zv=aQMAKar-<_F7-Mbj}73|*+=1JA(Q1~8O-?=XbP&r%kyA_>2F#@}Z zHg&kX+Sk-}XgLoyUHGBuOZKxj?yKdwGb3!49AH@9|3N2>eNQf6+T~N*BIfsO8G7ET z-s_x)rC*gDFy*S;J}S?yGiqdn;1oh0+$sS-yK5j*l+$mZyQ_Gw9oM~_anGmV3TYsCwJBBermo{%vgPO`K;}% zi#X%-Ed3!-ii-+DXl}9A1r$Xv_I|xfNpYLi#&l%X?gn2g~pOf;zX;<3y`} z2{Hff&Q7IrPx>^tFuPWM-Tce(#`wPQ)yy7Uo>OG@6>YBsmtgN{(GBahY|q}qVjk97 z5>MgxGW19py_O5tFu(Wp&cEHrw?6rL_ii*aFbAh+?^UNmhEw^#SlNis_T!wP&McpKZWT$} zlgEGT`)F~Ib^3gmUEdw8badK!avxtDA0(u*QmXx>?yN`SO0oJLOrNi@TfmfYeoeDc zr=>EjETzk#+Ix+CVb0rP$}#7`>n!CKZD86@IE}zlHeezvrBVUdrK@wBb02(@^=lcpdY>x_T0uwEp{g*^0(}qJ$WiCrFMbxIamnk{O#Q3fW?U$PczR| z>&@Joh!b6kJ2)sId#`a8`wq9H%e}kw{fEK>jqeLNK%IYi%sl??%9nU9W`6HMmG;M7 zfAG&CF7fEL9H3kF#lj=xvtE1F`$THpX<2{P9p4iZxmK3_1(nXS-qGI^w<$kqT-V#} z^nR`7enqP1i$2#$n>X(sR#aT}4XhkJig4j^=HF@1&z(WdV1_Z^mA1#As_o^on40=M zNSzXmeKNhiR!XC1o9CrT zHDD%x`+~)eu48WVyKenHE8XXeht2B46yN*y^hJMMwtP3OzvEB3Y3>JTyEDD6aKDsT z>Erf``dpfGCsxUs{cr_Jn`6#r0!=#u%X=&8skxs}`z>BKYcA`3Y|Fl8yP;FDu~C4M zqhaNY??WQgd3K(nBL@d2AEhaY_&odUBa0on*Yc^gs#e@g$}XMS1ep6x_09W9T7N&M zY}3)SkJtULzDpr}AL=cd+i+8#U;6uIN?SD3*bg8zuhZi`<%_ZZrq4mCPjs=~B_|)E z(l#PdR>oozpH&=7NfK74%nlr8hZE$4d;#+dswHKtZ+SaQWacA1+kF#r0$ zIsC!=)?Z3B;h+!3{D;aP%)0N+3Szfw_xFCuU5b5_{Yl;zB^NZTRGw;2F~3Jq--#3T zyC8;{tjE!ip}%lkIFQx2rTD4~7Y5fqMY{YJez5-OG>1!?_NE-N#phk2v;KN~Q*0k& z|BiJ2f!WRU`n;#)9`m~!54B#h$$D~Vjc)xom-!t#RqijxV>vmZdsxDEzk1|*C_#%f(f zT0`4lI=T56JLh$BYtYw&v{r0kJh%1tVC-D*&s6Me>))Ma+|)a`OAK?kU;$&?QdmjD z{+slCU7wrAmD?^K*phmFuzrp>o8L#XQqteCDgBRqn5N<8bHsVh$MN?+ZQ)W;rX7!u zX)k8^`!PjV8-KR}-$$3P#3cerrKhs(jdK}RN@+Wcb$;p>_iQh2Wtph$3^`X|S3O*@d{KC8PP>++cgv^LM5 zDaL%iG%@$LxxtFurSCt_%Q<0G(fI2hlwf|pV)>3GyoU3K+{ZSvQgTk2?G-U|HQPnq z+Mdw(VCoN#C7C{YXP^Z0_s2>}+fCN=`X+xkhSmF$v>mxps<#kdF^APA^_>{S&B5{4 z&@|5T@K(3dgTgsq+qO-nuAglAPTtQB)_VQNrhfaH{Sd_zz;@t?Y4}dWw6oUdH?0Sx+!(*1E0=dP?6oN7s`>qw z`nwoh@&v5@I_;Z^4f{ixW#v2t+BKHKoMWc}q-V!ZKP4g38t?Ur5g;q%lLPpEpZ z`S&F9seNT_k5p8BkFS){Rot|7XMzhWrTRJXx%Hvv9Ae&A_Vyk*yt(IQs>>g*N#Of&slE!_?sWQ1f^7e(|tbqW;E@H)b}owQd`Vv z269W&-i^BNHvc|{yMJtvg%$U5Vv_kCueMVympL^0&-S0$C2&Ok+b(mpT`Oz*);hyY zI|9AWTX0SO^?SFR;zAzSKAU-oUGH=LlqDCa(s}%>625cEb)P1``tXvoJSu%T@&0(q 
zn60&^$FXm$lq%aA-@lS-2kCl$-*-{vP~J=}n-3{;jjEo+SNn;LY48Zc{)|d&G3;lk za7X-klr5XMVE3?ykm5Ty!;`<)Mm9V~NpJ9HD_X%`RDGXcEFOQ(Cv80E*i(FP=W0*b z#T{l;D?d|2Im6C@iha5lv_0t^7o2xo4yq~F`c^N}9^|6r%%)woKA+5&)bLk8&LSLG z!T4T8pJ!9x&}Ue<9MtFdY`1;nzTCr_vu)A66EZ)|_aF9tC3XeK-}mC&<*~h9WFcv7 z^q2YfL|o%mJm|Y~00p=k_HdlTe(mNI*RtfAtN+*P$yZq^X}f_c9+i7vOYCadf70j1 zr7Q1U+ctMED&EWZo|HYzeFReNBi#==+_#bW98Z_NWsT9@;waz9p#053>#P( zb{xu1-?E>j_mA~=NNJPJ^@2Y4*Yn7-E&e%pXm(9jN_yW_&%d>npJMFyQI#Ap^LjVc KQjgk?{C@yzp%rcb literal 0 HcmV?d00001 diff --git a/tests/data/1138_bus.info b/tests/data/1138_bus.info new file mode 100644 index 0000000..270767f --- /dev/null +++ b/tests/data/1138_bus.info @@ -0,0 +1 @@ +-matload_block_size 1 From d7ba6834c9070964bb3a2c92ca64d4a0162c219b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 22:39:13 +0000 Subject: [PATCH 24/41] Add higher order test that fails in Loe --- tests/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index 355816e..e814af8 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -148,10 +148,14 @@ run_tests_load_serial: done # @echo "" - @echo "Test Newton GMRES polynomials matrix-free with added roots in market matrix problem 1138" + @echo "Test Newton GMRES polynomials order 60 matrix-free with added roots in market matrix problem 1138" ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 6 - @echo "Test Newton GMRES polynomials with fixed sparsity with added roots in market matrix problem 1138" - ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials order 60 with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials order 120 matrix-free with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 5 + @echo "Test Newton GMRES polynomials order 120 with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -ksp_norm_type unpreconditioned -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ From 0f15e4cd849e5976bba40959a224f24ff91182ab Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 23:17:00 +0000 Subject: [PATCH 25/41] Add better comments for Newton GMRES polynomial --- src/Gmres_Poly_Newton.F90 | 217 +++++++++++++++----------------------- 1 file changed, 85 insertions(+), 132 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 66bc5c8..c389b08 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -139,7 +139,7 @@ end subroutine modified_leja subroutine cluster_eigenvalues_stable(real_roots, imag_roots, rel_tol, abs_tol) - ! Robust clustering of (possibly complex) harmonic Ritz values. + ! Clustering of (possibly complex) harmonic Ritz values. ! Numerically distinct clusters are moved to the front. ! Remaining entries are set to zero. ! Skips eigenvalues that are exactly zero (both real and imag parts). 
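For reference, the Newton-basis application that these rewritten comments describe can be sketched on a scalar: each real root theta_i adds prod/theta_i to the result and then updates prod <- (1 - lambda/theta_i)*prod, while a complex conjugate pair a +/- b*i is handled as a single step that adds (2*a - lambda)*prod/(a^2 + b^2). The standalone program below is only an illustration and is not part of this patch; the root values in it are made up purely for the example. It evaluates p(lambda) with that recurrence and compares it to 1/lambda, which the GMRES polynomial approximates when lambda sits near the roots.

    program newton_poly_scalar_sketch
       implicit none
       ! Illustrative roots only: one real root followed by a complex
       ! conjugate pair stored as consecutive entries, as in the library
       real(8), dimension(3) :: re_roots = [2.0d0, 1.5d0, 1.5d0]
       real(8), dimension(3) :: im_roots = [0.0d0, 0.5d0, -0.5d0]
       real(8) :: lambda, p, prod, a, b, sq
       integer :: i
       lambda = 1.8d0
       p = 0.0d0      ! running value of p(lambda), which approximates 1/lambda
       prod = 1.0d0   ! running residual product over the roots processed so far
       i = 1
       do while (i <= size(re_roots))
          if (im_roots(i) == 0.0d0) then
             ! Real root theta_i: add prod/theta_i, then update the product
             p = p + prod/re_roots(i)
             prod = prod*(1.0d0 - lambda/re_roots(i))
             i = i + 1
          else
             ! Complex conjugate pair a +/- b*i, processed two roots at a time
             a = re_roots(i)
             b = im_roots(i)
             sq = a*a + b*b
             p = p + (2.0d0*a - lambda)*prod/sq
             prod = prod*(1.0d0 - (2.0d0*a*lambda - lambda*lambda)/sq)
             i = i + 2
          end if
       end do
       print *, "p(lambda) =", p, " 1/lambda =", 1.0d0/lambda
    end program newton_poly_scalar_sketch

At every step the identity p*lambda + prod = 1 is preserved, so the final p equals (1 - r(lambda))/lambda with r the residual polynomial built from the roots; the matrix-free and assembled code paths build the same quantity with vectors and A in place of lambda.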
@@ -169,7 +169,7 @@ subroutine cluster_eigenvalues_stable(real_roots, imag_roots, rel_tol, abs_tol) n_unique = 0 ! --------------------------------------------------------- - ! All-pairs clustering (no sorting to preserve proximity) + ! All-pairs clustering ! --------------------------------------------------------- do i = 1, n @@ -249,7 +249,7 @@ subroutine compute_extra_roots(real_roots, imag_roots, real_roots_output, imag_r ! of roots that have large products (to improve polynomial stability) ! Only non-zero eigenvalues should be passed in ! real_roots_output, imag_roots_output are allocated and filled with the original - ! roots plus any extra copies, with perturbed values for the leja sort + ! roots plus any extra ! ~~~~~~ PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots @@ -470,8 +470,9 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots ! What we find is that when use this to compute eigenvalues we find e-vals ! as we might expect up to the rank ! but then we have some eigenvalues that are numerically zero - ! We keep those and our application of the newton polynomial in - ! petsc_matvec_gmres_newton_mf and petsc_matvec_gmres_newton_mf_residual + ! Given the way the outside code is structured, we can't lower the poly_order + ! in this routine and return + ! Instead we keep the "zero" eigenvalues our application of the newton polynomial ! just skips them and hence we don't do any ! extra work in the application phase than we would have done with lower order @@ -543,16 +544,15 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) end if - ! print *, "coefficients r", coefficients(:, 1) - ! print *, "coefficients c", coefficients(:, 2) + ! ~~~~~~~~~~~~~~ + ! Now we have to check the output eigenvalues + ! ~~~~~~~~~~~~~~ ! These are the tolerances that control the clustering H_norm = norm2(H_n(1:m,1:m)) rel_tol = 1.0d0 * sqrt(epsilon(1.0d0)) abs_tol = epsilon(1.0d0) * max(H_norm, beta) - !print *, "H_norm", H_norm, "rel_tol", rel_tol, "abs_tol", abs_tol - ! In some cases with numerical rank deficiency, we can still ! end up with non-zero (or negative) eigenvalues that ! are trivially small - we set them explicitly to zero @@ -562,16 +562,18 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots coefficients(i_loc, 1) = 0d0 coefficients(i_loc, 2) = 0d0 end if - end do - - ! print *, "after zero coefficients r", coefficients(:, 1) - ! print *, "after zero coefficients c", coefficients(:, 2) + end do + ! ~~~~~~~~~~~~~~ ! Cluster close eigenvalues together to improve stability of the polynomial evaluation + ! For example when computing the e'vals of a constant diagonal matrix + ! the rank revealing factorisation above doesn't always report a rank of 1 given roundoff + ! Instead it returns multiple eigenvalues that are very close to each other, + ! and we want to cluster those together and treat them as one root + ! ~~~~~~~~~~~~~~ + + ! This places all exactly zero eigenvalues at the end of coefficients call cluster_eigenvalues_stable(coefficients(:, 1), coefficients(:, 2), rel_tol, abs_tol) - - ! print *, "after cluster coefficients r", coefficients(:, 1) - ! print *, "after cluster coefficients c", coefficients(:, 2) ! ~~~~~~~~~~~~~~ ! Extract the non-zero eigenvalues for root adding and leja ordering @@ -676,10 +678,7 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots end if - end if - - ! 
print *, "after root adding and leja coefficients r", coefficients(:, 1) - ! print *, "after root adding and leja coefficients c", coefficients(:, 2) + end if ! Cleanup do i_loc = 1, subspace_size+1 @@ -1109,25 +1108,32 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! In the mononomial case we just compute the matrix powers up to poly_sparsity_order ! and add them times the coefficients to cmat ! Here though we have to build the Newton basis polynomials - ! The complex conjugate roots are tricky as they build up two powers at a time - ! The powers higher than poly_sparsity_order can be done with only - ! a single bit of comms and is done below this + ! As a rule, the value input here into cmat is correct up to the power + ! of poly_sparsity_order, with all other terms being added in the fixed sparsity loops + ! below + ! + ! For complex conjugate roots, two terms are computed at a time, but if the sparsity order + ! falls in between those two roots, only part of the output cmat is included + ! Any remaining terms are output in either mat_sparsity_match or mat_product_save depending + ! on the case + ! mat_sparsity_match has either temp or prod in it depending on the case + ! mat_product_save only exists in some cases and stores prod from the previous term + ! mat_sparsity_match and mat_product_save are always output with the sparsity of sparsity order + ! + ! status_output is an array of length the number of roots + ! and has a 1 in each position if that term has been added to the output + ! It just helps us keep track of what has gone into cmat and what hasn't + ! It breaks up complex conjugate pairs into the first root (i) which has the same power as prod + ! tmp = 2 * a * prod + ! p = p + 1/(a^2 + b^2) * tmp + ! and the second root (i+1) which has the same power as prod * A: + ! tmp = -A * prod + ! p = p + 1/(a^2 + b^2) * tmp ! ~~~~~~~~~~ + output_first_complex = .FALSE. if (poly_sparsity_order == 1) then - ! If we've got first order sparsity, we want to build cmat up to first order - ! and then we add in higher order powers later - ! We can just pass in the first two roots to build the first order gmres polynomial - ! mat_sparsity_match gets out the parts of the product up to 1st order - ! for the real case this will be the equivalent of prod on line 5 of Alg 3 in Loe 2021 - ! I - 1/theta_1 A - ! whereas cmat will be 1/theta_1 + 1/theta_2 * (I - 1/theta_1 A) - ! For the complex case we instead pass out tmp from line 9 scaled by 1/(a^2 + b^2) - ! as this is the part of the product with sparsity up to A - ! This is because the prod for complex builds up the A^2 term for the next iteration - ! given it does two roots at a time - ! If we have a real first coefficient and a second complex ! we can't call build_gmres_polynomial_newton_inverse_1st_1st as it is only correct ! for valid coefficients up to 1st order (ie both real or both complex) @@ -1137,6 +1143,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & status_output, mat_product_save) + ! Valid 1st order polynomial, so this case is easy else ! Duplicate & copy the matrix, but ensure there is a diagonal present @@ -1149,30 +1156,15 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp end if else - ! print *,"reals", coefficients(:,1) - ! print *,"imags", coefficients(:,2) - ! If we're any higher, then we build cmat up to that order ! 
But we have to be careful because the last root we want to explicitly - ! build up to here (ie the power of the matrix given by poly_sparsity_order) + ! build up to here (ie the power of the matrix given by sparsity_order) ! might be the first root of a complex conjugate pair - ! In that case cmat only contains part of the result up to poly_sparsity_order - ! Similarly mat_sparsity_match contains the product up to poly_sparsity_order - ! The rest gets added in below - ! output_first_complex records if poly_sparsity_order hits the first root - ! of a complex conjugate pair, as we need to know that below to add in the rest - ! of the poly_sparsity_order+1 term from that pair - ! before moving on to the rest of the higher order roots call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, & coefficients(1:poly_sparsity_order + 1, 1:2), & cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & status_output, mat_product_save) - end if - - ! print *, "status output real", status_output(:, 1) - ! print *, "status output complex", status_output(:, 2) - - ! print *, "sum", sum(status_output, 2) + end if ! We know we will never have non-zero locations outside of the highest constrained sparsity power call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) @@ -1423,40 +1415,30 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp term = poly_sparsity_order + 1 skip_add = .FALSE. ! If the fixed sparsity root is the second of a complex pair, we start one term earlier - ! so that we can compute the correct part of the product, we just make sure not to add + ! so that we can compute the correct part of the fixed sparsity product, we just make sure not to add + ! anything to cmat as it is already correct up to the fixed sparsity order if (coefficients(term,2) /= 0d0 .AND. .NOT. output_first_complex) then term = term - 1 skip_add = .TRUE. end if - !print *, "starting loop at term ", term, "skip_add ", skip_add - ! This loop skips the last coefficient do while (term .le. size(coefficients, 1) - 1) - !print *, "term ", term, "coeff ", coefficients(term,1), coefficients(term,2), skip_add - ! If real if (coefficients(term,2) == 0d0) then + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton if (abs(coefficients(term,1)) < 1e-12) then term = term + 1 cycle end if - !print *, "REAL CASE assembly", term - ! ~~~~~~~~~~~ ! Now can add the value to our matrix - ! Can skip this if coeff is zero, but still need to compute A^(term-1) - ! for the next time through - ! Also we skip the first one if we're real as that value has already been added to the - ! matrix by the build_gmres_polynomial_newton_inverse_full (as we had to build the product up - ! to that order) ! ~~~~~~~~~~~ if (ncols /= 0 .AND. status_output(term, 1) /= 1) then - - !print *, "ADDING IN REAL TERM ", term call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) end if @@ -1464,16 +1446,13 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Initialize with previous product before the A*prod subtraction vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) - !print *, "DOING REAL PRODCUT for term ", term - - ! Have to finish all the columns before we move onto the next coefficient + ! This is the 1/theta_i * A * prod but where A * prod has fixed sparsity do j_loc = 1, ncols ! 
If we have no matching columns cycle this row if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle ! symbolic_vals(j_loc)%ptr has the matching values of A in it - ! This is the (I - A_ff/theta_k) * prod vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - & 1d0/coefficients(term, 1) * & symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) @@ -1484,23 +1463,23 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! If complex else + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton if (coefficients(term,1)**2 + coefficients(term,2)**2 < 1e-12) then term = term + 2 cycle end if - !print *, "COMPLEX CASE assembly", term - square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) + + ! If our fixed sparsity order falls on the first of a complex conjugate pair if (.NOT. skip_add) then ! We skip the 2 * a * prod from the first root of a complex pair if that has already - ! been included in the inv_matrix from build_gmres_polynomial_newton_inverse_full + ! been included in the cmat from build_gmres_polynomial_newton_inverse_full if (status_output(term, 2) /= 1) then - !print *, term, "adding in 2a prod" temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) else - !print *, term, "skipping adding in 2a prod" temp(1:ncols) = 0d0 end if @@ -1521,69 +1500,57 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp square_sum * temp(1:ncols), ADD_VALUES, ierr) end if - ! for (r, c, c) - ! problem here is 2 *a * prod has been added to inv_matrix but we need to have added - ! 2aprod/a^2+b^2 - ! for (c, c, r) mat product is output without the 1/a^2+b^2 but that is fine as we - ! compensate for that in the product + ! Here we need to go back in and ensure 2 *a * prod is in temp if we skipped it + ! above. We know it is already in cmat, but it has to be in temp when we + ! do the next product if (status_output(term, 2) == 1) then if (output_first_complex) then - !print *, "ADDING IN 2a prod second time for term ", term temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) end if end if - ! First time through complex pair + ! If our fixed sparsity order falls on the second of a complex conjugate pair else - !print *, "SKIP ADDING IN COMPLEX TERM ", term - !@@@ for the case where we have (r, c, c, ....) and second order sparsity - ! i think the problem is that we have to skip adding anything to p as inverse_matrix - ! already has the correct values in it, as we computed tmp which will have 2nd order terms - ! in it, but we skipped the product in the full, which is correct as that would compute 3rd order - ! terms. so the thing that gets output in mat_prod_or_tmp is tmp - ! + ! In this case we have already included both 2*a*prod - A * prod into cmat + ! But we still have to compute the product for the next term + ! The problem here is that mat_sparsity_match has temp in it in this case, not + ! the old prod from whatever the previous loop is + ! In that case build_gmres_polynomial_newton_inverse_full also outputs + ! mat_product_save which is the old value of prod but with the sparsity of + ! mat_sparsity_match (with zeros if needed) - ! If we're skipping the add, then vals_previous_power_temp has all the correct - ! values in it for temp - ! All we have to do is compute prod for the next time through + ! This case only occurs once for each row, so once we've hit this + ! 
we will always have our correct prod skip_add = .FALSE. - !@@@@ so then this line sets temp to be tmp + ! temp is output into mat_sparsity_match in this case temp(1:ncols) = vals_previous_power_temp(1:ncols) - ! @@@ have to be careful here! - ! If we've gone back a term, we don't have anything in prod - ! prod is I when term = 1 - ! @@@@ if we're doing this for the first time, we know product is I - ! so we just set prod to be I - ! @@@@ the problem is if we're not doing this for the first time - ! we need to know what prod had in it from the previous time, as our full - ! is only outputting prod or temp, not both, because at lower order when we output - ! temp in this case we knew prod was I so we didn't have to store both - ! in the (r, c, c) case prod will have been I - 1/theta_1 A_ff from the r - ! but for it to work with the loop below vals_previous_power_temp has to contain that but - ! over the sparsity of the 2nd order term. + ! If sparsity order is 1, the previous product will have been the identity + ! and we don't output it into mat_product_save because that is a trivial case + ! we can do ourselves if (term == 1) then vals_previous_power_temp(1:ncols) = 0d0 if (diag_index /= -1) then vals_previous_power_temp(diag_index) = 1d0 end if + ! In the case the mat_product_save is not the identity, we need to pull it's value out - ! We only do this once for the first term in this case else call MatGetRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, & cols_two_ptr, vals_two_ptr, ierr) - ! We have guaranteed in the full version that mat_product_save has fixed sparsity + ! We have guaranteed in the build_gmres_polynomial_newton_inverse_full + ! version that mat_product_save has fixed sparsity vals_previous_power_temp(1:ncols_two) = vals_two_ptr(1:ncols_two) call MatRestoreRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, & cols_two_ptr, vals_two_ptr, ierr) - end if end if + ! Now we compute the next product if (term .le. size(coefficients, 1)- 2) then vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) @@ -1605,18 +1572,20 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp end if - ! This should now have the value of A^(term-1) in it + ! This should now have the value of prod in it vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) end do ! Final step if last root is real if (coefficients(term,2) == 0d0) then if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then - !print *, "adding REAL final term ", term, " coeff ", coefficients(term,1) call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) end if end if + + ! ~~~~~~~~~~~~~~~ + do j_loc = 1, ncols if (associated(symbolic_ones(j_loc)%ptr)) then deallocate(symbolic_ones(j_loc)%ptr) @@ -1646,6 +1615,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Delete temporaries call MatDestroy(mat_sparsity_match, ierr) + !call MatDestroy(mat_product_save, ierr) if (deallocate_submatrices) then deallocate(reuse_submatrices) reuse_submatrices => null() @@ -2104,9 +2074,10 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi status_output, mat_product_save) ! No constrained sparsity by default - ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex + ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex, status_output and + ! mat_product_save ! 
then it will build part of the terms, up to poly_sparsity_order, and return the product - ! in mat_prod_or_temp that you need to compute the rest of the fixed sparsity terms + ! in mat_prod_or_temp and mat_product_save that you need to compute the rest of the fixed sparsity terms ! ~~~~~~ type(tMat), intent(in) :: matrix @@ -2158,9 +2129,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! where ^r means a purely real root and ^c means a complex root ! want poly_sparsity_order = 1, we can't process all the way up to theta_3^c as that would ! compute up to an A^2 term which is beyond our sparsity constraint - ! So we just check if the last root also has it's complex conjugate present - ! This will never happen in any context except when we are outputting the product - ! as part of a fixed sparsity multiply ! i_sparse tells us how many roots we are going to process ! Normally this would just be size(coefficients, 1) and the loop below goes up @@ -2184,8 +2152,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi i_sparse = size(coefficients, 1) first_complex = .FALSE. - !print *, "size coeffs", size(coefficients, 1), "coeffs", coefficients(:, 1), coefficients(:, 2) - if (output_product) then output_first_complex = .FALSE. @@ -2217,8 +2183,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi first_complex = output_first_complex end if - !print *, "i_sparse", i_sparse, "output_first_complex", output_first_complex - ! ~~~~~~~~~~~~ ! Iterate over the i ! This is basically the same as the MF application but we have to build the powers @@ -2228,8 +2192,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We're always building up the next product do while (i .le. i_sparse - 1) - !print *, "i = ", i - ! Duplicate & copy the matrix, but ensure there is a diagonal present ! temp_mat_A is going to store things with the sparsity of A if (PetscObjectIsNull(temp_mat_A)) then @@ -2242,8 +2204,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! If real this is easy if (coefficients(i,2) == 0d0) then - !print *, "real", "i_sparse", i_sparse - ! Skips eigenvalues that are numerically zero ! We still compute the entries as as zero because we need the sparsity ! to be correct for the next iteration @@ -2286,7 +2246,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then - !print *, "outputting product in real case", "i_sparse", i_sparse, "i", i call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2295,8 +2254,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Complex else - !print *, "complex", first_complex - ! Skips eigenvalues that are numerically zero if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then square_sum = 0 @@ -2346,9 +2303,9 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) end if - ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + ! We copy out the last part of the old product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. 
i > i_sparse - 2) then - !print *, "outputting TEMP in complex case", "i_sparse", i_sparse, "i", i + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) ! If i == 1 then we know mat_product is the identity and we don't bother ! to write it out, we just have some custom code in the product given its trivial @@ -2374,8 +2331,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi if (i .le. i_sparse - 2) then - !print *, "doing complex matmult step" - ! temp_mat_three = matrix * temp_mat_two call MatMatMult(matrix, temp_mat_two, & MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) @@ -2396,7 +2351,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. .NOT. first_complex) then - !print *, "outputting product in complex case", "i_sparse", i_sparse, "i", i call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2419,7 +2373,6 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Skips eigenvalues that are numerically zero if (abs(coefficients(i,1)) > 1e-12) then - !print *, "doing last real step, adding in term", i, "coeff", coefficients(i,1) if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) From 85015cfd08ff3df183229eaec03ee5f2a10bc215 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 9 Feb 2026 23:47:05 +0000 Subject: [PATCH 26/41] Fix memory leaks --- src/Gmres_Poly_Newton.F90 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index c389b08..aecb590 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -1615,7 +1615,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! Delete temporaries call MatDestroy(mat_sparsity_match, ierr) - !call MatDestroy(mat_product_save, ierr) + call MatDestroy(mat_product_save, ierr) if (deallocate_submatrices) then deallocate(reuse_submatrices) reuse_submatrices => null() @@ -2236,6 +2236,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! mat_product_k_plus_1 = mat_product * temp_mat_A if (i == 1) then ! If i == 1 then we know mat_product is identity so we can just copy + call MatDestroy(mat_product, ierr) call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, mat_product, ierr) else call MatMatMult(temp_mat_A, mat_product, & From 4b0d5dda6edd0707d6bcdc1e7581fb2f3c1a818f Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 00:11:56 +0000 Subject: [PATCH 27/41] Add specific test with diagonal matrix for different GMRES polynomials to test they can handle solving a problem where a lower order polynomial is an exact solution. 
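A quick way to see what this test checks, using the same values that appear in mat_diag.F90 below: for a diagonal matrix with the two distinct entries theta_1 = 1.5 and theta_2 = 2.5, the first-order Newton form p(lambda) = 1/theta_1 + (1/theta_2)*(1 - lambda/theta_1) already interpolates 1/lambda exactly at both eigenvalues, since p(1.5) = 1/1.5 and p(2.5) = 2/3 + (1/2.5)*(1 - 2.5/1.5) = 2/3 - 4/15 = 2/5 = 1/2.5. For the identity or a constant diagonal a single root suffices. That is why the Makefile runs these solves with -ksp_max_it 1 and relies on the setup recognising that only one or two non-zero roots are needed.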
--- Makefile | 2 +- tests/Makefile | 10 +++- tests/mat_diag.F90 | 119 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 tests/mat_diag.F90 diff --git a/Makefile b/Makefile index 87fe1a6..87f3926 100644 --- a/Makefile +++ b/Makefile @@ -138,7 +138,7 @@ OBJS := $(OBJS) $(SRCDIR)/PETSc_Helper.o \ $(SRCDIR)/PCPFLAREINV.o # Define a variable containing all the tests -export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset ex12f_gmres_poly +export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset ex12f_gmres_poly mat_diag # Include kokkos examples ifeq ($(PETSC_HAVE_KOKKOS),1) export TEST_TARGETS := $(TEST_TARGETS) adv_1dk diff --git a/tests/Makefile b/tests/Makefile index e814af8..a1b773d 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -450,7 +450,15 @@ run_tests_no_load_serial: ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ done; \ - done + done +# + @echo "" + @echo "Test Newton GMRES polynomial on a diagonal matrix" + ./mat_diag -ksp_max_it 1 + @echo "Test Arnoldi GMRES polynomial on a diagonal matrix" + ./mat_diag -pc_pflareinv_type arnoldi -ksp_max_it 1 + @echo "Test Power GMRES polynomial on a diagonal matrix" + ./mat_diag -pc_pflareinv_type power -ksp_max_it 1 # # ~~~~~~~~~~~~~~~~~~~~~~~ # Include kokkos examples diff --git a/tests/mat_diag.F90 b/tests/mat_diag.F90 new file mode 100644 index 0000000..14da617 --- /dev/null +++ b/tests/mat_diag.F90 @@ -0,0 +1,119 @@ +#include +#include "finclude/pflare.h" + use petscksp + implicit none + + ! Test that the gmres polynomials can handle small solve of diagonal matrix + ! We leave the polynomial order here as the default (which is 6), despite + ! the fact that much lower polynomial order is an exact solve in this case + ! This tests that the various gmres polynomial methods correctly + ! identify we only need up to lower order + + PetscErrorCode :: ierr + Mat :: A + PetscInt :: m, n, nnzs + PetscInt, parameter :: one = 1, two = 2, three = 3 + Vec :: x,b + KSP :: ksp + PC :: pc + PetscBool :: flg + KSPConvergedReason reason + PetscRandom rctx + + call PetscInitialize(PETSC_NULL_CHARACTER,ierr) + ! Register the pflare types + call PCRegister_PFLARE() + + m = 10 + n = 10 + call PetscOptionsGetInt(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER,'-m',m,flg,ierr) + call PetscOptionsGetInt(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER,'-n',n,flg,ierr) + + ! Create matrix + call MatCreate(PETSC_COMM_WORLD,A,ierr) + call MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,m*n,m*n,ierr) + call MatSetFromOptions(A,ierr) + nnzs = m; + call MatSeqAIJSetPreallocation(A, nnzs, PETSC_NULL_INTEGER_ARRAY, ierr) + call MatMPIAIJSetPreallocation(A, nnzs, PETSC_NULL_INTEGER_ARRAY, nnzs, PETSC_NULL_INTEGER_ARRAY, ierr) + call MatSetUp(A,ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE, ierr) + + call MatCreateVecs(A,b,x,ierr) + call VecSet(x, 0d0, ierr) + + ! Random rhs + call PetscRandomCreate(PETSC_COMM_WORLD, rctx, ierr) + call PetscRandomSetFromOptions(rctx, ierr) + call VecSetRandom(b, rctx, ierr) + call PetscRandomDestroy(rctx, ierr) + + ! ~~~~~~~~~~~~~~ + ! 
Set constant diagonal values in matrix + ! In Newton form should only need a single root + ! (ie a 0th order polynomial) for an exact solve + ! Starting with the identity, the inverse should also be the identity + ! ~~~~~~~~~~~~~~ + call MatShift(A, 1d0, ierr) + + call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) + call KSPSetOperators(ksp,A,A,ierr) + call KSPGetPC(ksp,pc,ierr) + ! Set newton gmres polynomial as PC + call PCSetType(pc, PCPFLAREINV, ierr) + call PCPFLAREINVSetType(pc, PFLAREINV_NEWTON, ierr) + call KSPSetPC(ksp, pc, ierr) + call KSPSetFromOptions(ksp,ierr) + call KSPSetUp(ksp,ierr) + + ! Do the solve + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + ! ~~~~~~~~~~~~~~ + ! Instead now set the diagonal to 1.5 + ! In Newton form should only need a single root + ! ~~~~~~~~~~~~~~ + call MatShift(A, 0.5d0, ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call VecSet(x, 0d0, ierr) + + ! Do another solve - this will automatically trigger the setup as the matrix + ! has changed + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + ! ~~~~~~~~~~~~~~ + ! Instead now have two different constant values in the diagonal + ! In Newton form should only need two roots + ! (ie a 1st order polynomial) for an exact solve + ! ~~~~~~~~~~~~~~ + ! Set one of the values to 2.5 + call MatSetValue(A, 0, 0, 2.5d0, INSERT_VALUES, ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call VecSet(x, 0d0, ierr) + + ! Do another solve - this will automatically trigger the setup as the matrix + ! has changed + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + call MatDestroy(A, ierr) + call VecDestroy(b, ierr) + call VecDestroy(x, ierr) + call KSPDestroy(ksp, ierr) + call PetscFinalize(ierr) + end \ No newline at end of file From ffccf78cfc4d2cd60bc4a36c541741b25463175b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 00:26:20 +0000 Subject: [PATCH 28/41] 64-bit fixes --- src/Gmres_Poly_Newton.F90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index aecb590..40d90d5 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -1055,9 +1055,9 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp PetscInt :: local_rows, local_cols, global_rows, global_cols PetscInt :: global_row_start, global_row_end_plus_one, row_index_into_submatrix PetscInt :: global_col_start, global_col_end_plus_one, n, ncols, ncols_two, ifree, max_nnzs - PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0 + PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0, diag_index integer :: errorcode, match_counter, term - integer :: comm_size, diag_index + integer :: comm_size PetscErrorCode :: ierr integer, dimension(:), allocatable :: cols_index_one, cols_index_two PetscInt, dimension(:), allocatable :: col_indices_off_proc_array, ad_indices, cols @@ -1149,7 +1149,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! 
Duplicate & copy the matrix, but ensure there is a diagonal present call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat) - call build_gmres_polynomial_newton_inverse_1st_1st(matrix, one, & + call build_gmres_polynomial_newton_inverse_1st_1st(matrix, 1, & coefficients(1:poly_sparsity_order + 1, 1:2), & cmat, mat_sparsity_match, & status_output) From 725e974f4ce77fb73250080d3a6d27eaf0cc0e2f Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 00:33:51 +0000 Subject: [PATCH 29/41] 64-bit fix for new diagonal test --- tests/mat_diag.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mat_diag.F90 b/tests/mat_diag.F90 index 14da617..1d6a54a 100644 --- a/tests/mat_diag.F90 +++ b/tests/mat_diag.F90 @@ -12,7 +12,7 @@ PetscErrorCode :: ierr Mat :: A PetscInt :: m, n, nnzs - PetscInt, parameter :: one = 1, two = 2, three = 3 + PetscInt, parameter :: one = 1, two = 2, three = 3, zero = 0 Vec :: x,b KSP :: ksp PC :: pc @@ -98,7 +98,7 @@ ! (ie a 1st order polynomial) for an exact solve ! ~~~~~~~~~~~~~~ ! Set one of the values to 2.5 - call MatSetValue(A, 0, 0, 2.5d0, INSERT_VALUES, ierr) + call MatSetValue(A, zero, zero, 2.5d0, INSERT_VALUES, ierr) call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) call VecSet(x, 0d0, ierr) From 385ea448bc508b3578c2abcacdc07ffe2254da1f Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 02:42:49 +0000 Subject: [PATCH 30/41] Fix coefficients access out of bounds in some circumstances --- src/Gmres_Poly_Newton.F90 | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 40d90d5..12e50f7 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -553,6 +553,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots rel_tol = 1.0d0 * sqrt(epsilon(1.0d0)) abs_tol = epsilon(1.0d0) * max(H_norm, beta) + print *, "coeffs", coefficients(:,1), "imag", coefficients(:,2) + ! In some cases with numerical rank deficiency, we can still ! end up with non-zero (or negative) eigenvalues that ! are trivially small - we set them explicitly to zero @@ -562,7 +564,9 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots coefficients(i_loc, 1) = 0d0 coefficients(i_loc, 2) = 0d0 end if - end do + end do + + print *, "coeffs after zero", coefficients(:,1), "imag", coefficients(:,2) ! ~~~~~~~~~~~~~~ ! Cluster close eigenvalues together to improve stability of the polynomial evaluation @@ -575,6 +579,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots ! This places all exactly zero eigenvalues at the end of coefficients call cluster_eigenvalues_stable(coefficients(:, 1), coefficients(:, 2), rel_tol, abs_tol) + print *, "coeffs after cluster", coefficients(:,1), "imag", coefficients(:,2) + ! ~~~~~~~~~~~~~~ ! Extract the non-zero eigenvalues for root adding and leja ordering ! Zero eigenvalues will be appended at the end @@ -613,6 +619,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots ! 
Resize coefficients to hold non-zero roots (with extras) + zero roots at end deallocate(coefficients) + print *, "size(real_roots_added)", size(real_roots_added), "poly_order + 1", poly_order + 1, "numerical_order", numerical_order + print *, "new size", size(real_roots_added) + (poly_order + 1 - numerical_order) allocate(coefficients(size(real_roots_added) + (poly_order + 1 - numerical_order), 2)) coefficients = 0d0 @@ -685,6 +693,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots call VecDestroy(V_n(i_loc), ierr) end do call VecDestroy(w_j, ierr) + + print *, "coeffs after leja", coefficients(:,1), "imag", coefficients(:,2) end subroutine calculate_gmres_polynomial_roots_newton @@ -799,11 +809,11 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) if (mat_ctx%imag_roots(size(mat_ctx%real_roots)) == 0d0) then ! Skips eigenvalues that are numerically zero - if (abs(mat_ctx%real_roots(i)) > 1e-12) then + if (abs(mat_ctx%real_roots(size(mat_ctx%real_roots))) > 1e-12) then ! y = y + theta_i * MF_VEC_TEMP call VecAXPBY(y, & - 1d0/mat_ctx%real_roots(i), & + 1d0/mat_ctx%real_roots(size(mat_ctx%real_roots)), & 1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) end if @@ -1575,12 +1585,12 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! This should now have the value of prod in it vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols) end do - + ! Final step if last root is real - if (coefficients(term,2) == 0d0) then - if (ncols /= 0 .AND. abs(coefficients(term,1)) > 1e-12) then + if (coefficients(size(coefficients, 1),2) == 0d0) then + if (ncols /= 0 .AND. abs(coefficients(size(coefficients, 1),1)) > 1e-12) then call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & - 1d0/coefficients(term, 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) + 1d0/coefficients(size(coefficients, 1), 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr) end if end if @@ -1926,8 +1936,8 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly ! Add in the final term multiplied by 1/theta_poly_order ! Skips eigenvalues that are numerically zero - if (abs(coefficients(i,1)) > 1e-12) then - call VecAXPY(inv_vec, 1d0/coefficients(i,1), product_vec, ierr) + if (abs(coefficients(size(coefficients, 1),1)) > 1e-12) then + call VecAXPY(inv_vec, 1d0/coefficients(size(coefficients, 1),1), product_vec, ierr) end if end if @@ -2372,16 +2382,16 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Add in the final term multiplied by 1/theta_poly_order ! Skips eigenvalues that are numerically zero - if (abs(coefficients(i,1)) > 1e-12) then + if (abs(coefficients(i_sparse,1)) > 1e-12) then if (reuse_triggered) then ! If doing reuse we know our nonzeros are a subset - call MatAXPY(inv_matrix, 1d0/coefficients(i,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + call MatAXPY(inv_matrix, 1d0/coefficients(i_sparse,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) else ! 
Have to use the DIFFERENT_NONZERO_PATTERN here - call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i,1), mat_product) + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i_sparse,1), mat_product) end if - if (output_product) status_output(i, 1) = 1 + if (output_product) status_output(i_sparse, 1) = 1 end if end if end if From 10328e9ca2f44e62bbe9cce044e47664654317cb Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 03:05:14 +0000 Subject: [PATCH 31/41] Need to respect contiguous keyword when passing in coefficients below the poly order --- src/Gmres_Poly_Newton.F90 | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 12e50f7..cd37029 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -553,8 +553,6 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots rel_tol = 1.0d0 * sqrt(epsilon(1.0d0)) abs_tol = epsilon(1.0d0) * max(H_norm, beta) - print *, "coeffs", coefficients(:,1), "imag", coefficients(:,2) - ! In some cases with numerical rank deficiency, we can still ! end up with non-zero (or negative) eigenvalues that ! are trivially small - we set them explicitly to zero @@ -565,9 +563,7 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots coefficients(i_loc, 2) = 0d0 end if end do - - print *, "coeffs after zero", coefficients(:,1), "imag", coefficients(:,2) - + ! ~~~~~~~~~~~~~~ ! Cluster close eigenvalues together to improve stability of the polynomial evaluation ! For example when computing the e'vals of a constant diagonal matrix @@ -579,8 +575,6 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots ! This places all exactly zero eigenvalues at the end of coefficients call cluster_eigenvalues_stable(coefficients(:, 1), coefficients(:, 2), rel_tol, abs_tol) - print *, "coeffs after cluster", coefficients(:,1), "imag", coefficients(:,2) - ! ~~~~~~~~~~~~~~ ! Extract the non-zero eigenvalues for root adding and leja ordering ! Zero eigenvalues will be appended at the end @@ -619,8 +613,6 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots ! Resize coefficients to hold non-zero roots (with extras) + zero roots at end deallocate(coefficients) - print *, "size(real_roots_added)", size(real_roots_added), "poly_order + 1", poly_order + 1, "numerical_order", numerical_order - print *, "new size", size(real_roots_added) + (poly_order + 1 - numerical_order) allocate(coefficients(size(real_roots_added) + (poly_order + 1 - numerical_order), 2)) coefficients = 0d0 @@ -692,10 +684,7 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots do i_loc = 1, subspace_size+1 call VecDestroy(V_n(i_loc), ierr) end do - call VecDestroy(w_j, ierr) - - print *, "coeffs after leja", coefficients(:,1), "imag", coefficients(:,2) - + call VecDestroy(w_j, ierr) end subroutine calculate_gmres_polynomial_roots_newton @@ -1091,6 +1080,7 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp logical :: output_first_complex, skip_add PetscReal :: square_sum integer, dimension(poly_order + 1, 2) :: status_output + PetscReal, dimension(poly_sparsity_order+1,2) :: coeffs_contig ! ~~~~~~~~~~ @@ -1158,9 +1148,10 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! 
Duplicate & copy the matrix, but ensure there is a diagonal present call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat) - + ! Have to be careful to pass in a contiguous piece of memory here + coeffs_contig = coefficients(1:poly_sparsity_order + 1, 1:2) call build_gmres_polynomial_newton_inverse_1st_1st(matrix, 1, & - coefficients(1:poly_sparsity_order + 1, 1:2), & + coeffs_contig, & cmat, mat_sparsity_match, & status_output) end if @@ -1170,8 +1161,9 @@ subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sp ! But we have to be careful because the last root we want to explicitly ! build up to here (ie the power of the matrix given by sparsity_order) ! might be the first root of a complex conjugate pair + coeffs_contig = coefficients(1:poly_sparsity_order + 1, 1:2) call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, & - coefficients(1:poly_sparsity_order + 1, 1:2), & + coeffs_contig, & cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, & status_output, mat_product_save) end if From 1da7e42c76de2cad57c8cb36361b2e2857d8a34b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 03:40:10 +0000 Subject: [PATCH 32/41] Failures in for loops in tests/Makefile weren't propogating through to the CI --- tests/Makefile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index a1b773d..ed5d6cc 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -134,12 +134,12 @@ run_tests_load_serial: # @echo "" @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders" - @for order in 0 1 2 3 4 5 6; do \ + @set -e; for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ done @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity" - @for order in 2 3 4 5 6; do \ + @set -e; for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ for sparsity in $$(seq 1 $$(($$order - 1))); do \ echo " --- Testing sparsity order = $$sparsity ---"; \ @@ -200,16 +200,16 @@ run_tests_load_parallel: # @echo "" @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders in parallel" - @for order in 0 1 2 3 4 5 6; do \ + @set -e; for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ done @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity in parallel" - @for order in 2 3 4 5 6; do \ + @set -e; for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ for sparsity in $$(seq 1 $$(($$order - 1))); do \ echo " --- Testing sparsity order = $$sparsity ---"; \ - $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ done; \ done @@ -437,13 +437,13 @@ run_tests_no_load_serial: # @echo "" @echo "Test Newton AIRG on advection for for different orders" - @for order in 0 1 2 3 4 5 6; do \ + @set -e; for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order 
= $$order ---"; \ ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 5; \ done @echo "Test Newton AIRG on advection for for different orders and fixed sparsity" - @for order in 2 3 4 5 6; do \ + @set -e; for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ for sparsity in $$(seq 1 $$(($$order - 1))); do \ echo " --- Testing sparsity order = $$sparsity ---"; \ @@ -659,13 +659,13 @@ run_tests_no_load_parallel: # @echo "" @echo "Test Newton AIRG on advection for for different orders in parallel" - @for order in 0 1 2 3 4 5 6; do \ + @set -e; for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 5; \ done @echo "Test Newton AIRG on advection for for different orders and fixed sparsity in parallel" - @for order in 2 3 4 5 6; do \ + @set -e; for order in 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ for sparsity in $$(seq 1 $$(($$order - 1))); do \ echo " --- Testing sparsity order = $$sparsity ---"; \ @@ -694,7 +694,7 @@ run_tests_medium_serial: # @echo "" - @for size in $$(seq 100 20 200); do \ + @set -e; for size in $$(seq 100 20 200); do \ echo "--- Testing size = $$size x $$size ---"; \ ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -702,7 +702,7 @@ run_tests_medium_serial: # @echo "" @echo "--- Running scaling study on adv_diff_2d ---" - @for size in 100 200 400 800; do \ + @set -e; for size in 100 200 400 800; do \ echo "--- Testing size = $$size x $$size ---"; \ ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -714,7 +714,7 @@ run_tests_medium_parallel: # @echo "" - @for size in $$(seq 100 20 200); do \ + @set -e; for size in $$(seq 100 20 200); do \ echo "--- Testing size = $$size x $$size ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -722,7 +722,7 @@ run_tests_medium_parallel: # @echo "" @echo "--- Running scaling study on adv_diff_2d in parallel ---" - @for size in 100 200 400 800; do \ + @set -e; for size in 100 200 400 800; do \ echo "--- Testing size = $$size x $$size ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ From 30354dc2c3c7144627d4144e868dd23857ac1759 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 14:09:20 +0000 Subject: [PATCH 33/41] Detection of complex conjugate pairs was broken when imaginary parts were very close --- src/Gmres_Poly_Newton.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index cd37029..23c3209 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -2174,7 +2174,7 @@ subroutine 
build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! Check if the distance between the fixed sparsity root and the one before ! If > zero then they are not complex conjugates and hence we are on the first of the pair - if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .AND. & + if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .OR. & abs(coefficients(i_sparse,2) + coefficients(i_sparse-1,2))/coefficients(i_sparse,2) > 1e-14) then output_first_complex = .TRUE. i_sparse = i_sparse + 1 From 38fba34be009dedcb1015896c96ad42f59fcc844 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 14:54:59 +0000 Subject: [PATCH 34/41] Fix memory leak --- src/Gmres_Poly_Newton.F90 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 23c3209..1761c90 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -2249,6 +2249,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i == i_sparse - 1) then + call MatDestroy(mat_prod_or_temp, ierr) call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2282,6 +2283,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then + call MatDestroy(mat_prod_or_temp, ierr) call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if @@ -2309,11 +2311,13 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the old product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. i > i_sparse - 2) then + call MatDestroy(mat_prod_or_temp, ierr) call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) ! If i == 1 then we know mat_product is the identity and we don't bother ! to write it out, we just have some custom code in the product given its trivial if (i /= 1) then ! This ensures it has the matching sparsity + call MatDestroy(mat_product_save, ierr) call MatConvert(mat_prod_or_temp, MATSAME, MAT_INITIAL_MATRIX, mat_product_save, ierr) ! This zeros mat_product_save and then puts mat_product into the sparsity pattern ! of mat_prod_or_temp @@ -2354,6 +2358,7 @@ subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coeffi ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply if (output_product .AND. .NOT. 
first_complex) then + call MatDestroy(mat_prod_or_temp, ierr) call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) end if From 994a0514eff6229d0083cab95d0b96efe2193c71 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 15:33:19 +0000 Subject: [PATCH 35/41] Iteration count change for CI --- tests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Makefile b/tests/Makefile index ed5d6cc..947b1fd 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -440,7 +440,7 @@ run_tests_no_load_serial: @set -e; for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ - -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 6; \ done @echo "Test Newton AIRG on advection for for different orders and fixed sparsity" @set -e; for order in 2 3 4 5 6; do \ From 450d9bb76ddbf924104c0e804ece8f904d751b04 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 16:36:36 +0000 Subject: [PATCH 36/41] Intel CI is still broken with power basis, disable power basis comparison in ex12f_gmres_poly in parallel --- dockerfiles/Dockerfile_intel | 3 ++ tests/ex12f_gmres_poly.F90 | 61 +++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/dockerfiles/Dockerfile_intel b/dockerfiles/Dockerfile_intel index ea51ed6..b7a2ae3 100644 --- a/dockerfiles/Dockerfile_intel +++ b/dockerfiles/Dockerfile_intel @@ -28,6 +28,9 @@ RUN source /opt/intel/oneapi/setvars.sh && \ sed -i '/^run_tests_load_parallel:/,/^run_tests_no_load_serial:/s/-pc_pflareinv_type power/-pc_pflareinv_type arnoldi/g' tests/Makefile && \ sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/-pc_air_inverse_type power/-pc_air_inverse_type arnoldi/g' tests/Makefile && \ sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/-pc_pflareinv_type power/-pc_pflareinv_type arnoldi/g' tests/Makefile && \ + echo "Disabling power basis in parallel test ex12f_gmres_poly in tests/Makefile" && \ + sed -i '/^run_tests_load_parallel:/,/^run_tests_no_load_serial:/s/ex12f_gmres_poly/ex12f_gmres_poly -no_power/g' tests/Makefile && \ + sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/ex12f_gmres_poly/ex12f_gmres_poly -no_power/g' tests/Makefile && \ make -j2 && make -j2 check && \ make -j2 tests diff --git a/tests/ex12f_gmres_poly.F90 b/tests/ex12f_gmres_poly.F90 index dc597bb..c0d188e 100644 --- a/tests/ex12f_gmres_poly.F90 +++ b/tests/ex12f_gmres_poly.F90 @@ -9,7 +9,7 @@ program main PetscErrorCode ierr PetscInt m,n,mlocal,nlocal - PetscBool flg + PetscBool flg, check, no_power PetscReal norm_power, norm_rhs, norm_arnoldi, norm_newton PetscReal :: norm_diff_one, norm_diff_two Vec x,b,u, b_diff_type @@ -33,6 +33,12 @@ program main & PETSC_NULL_CHARACTER,'-f',f,flg,ierr) call PetscViewerBinaryOpen(PETSC_COMM_WORLD,f,FILE_MODE_READ, & & fd,ierr) + no_power = PETSC_FALSE + ! Our CI has an intel pipeline and the intel MPI breaks with the power basis + ! so we can disable the power basis test with a command line option + call PetscOptionsGetBool(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER, & + '-no_power', check,flg,ierr) + if (flg) no_power = check call MatCreate(PETSC_COMM_WORLD,A,ierr) call MatLoad(A,fd,ierr) @@ -85,16 +91,15 @@ program main call VecNorm(b,NORM_2,norm_rhs,ierr) ! 
~~~~~~~~~~~~~ - ! Do a solve with the power basis + ! Do a solve with the Arnoldi basis ! ~~~~~~~~~~~~~ call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) call KSPSetOperators(ksp,A,A,ierr) call KSPGetPC(ksp, pc, ierr) call PCSetType(pc, PCAIR, ierr) - call PCAIRSetInverseType(pc, PFLAREINV_POWER, ierr) + call PCAIRSetInverseType(pc, PFLAREINV_ARNOLDI, ierr) call KSPSetPC(ksp, pc, ierr) call KSPSetFromOptions(ksp,ierr) - call VecSet(x, 0d0, ierr) call KSPSolve(ksp,b,x,ierr) call KSPGetConvergedReason(ksp,reason,ierr) @@ -104,25 +109,27 @@ program main ! Compute the residual call MatMult(A,x,u,ierr) call VecAXPY(u,-1d0,b,ierr) - call VecNorm(u,NORM_2,norm_power,ierr) - norm_power = norm_power/norm_rhs + call VecNorm(u,NORM_2,norm_arnoldi,ierr) + norm_arnoldi = norm_arnoldi/norm_rhs ! ~~~~~~~~~~~~~ - ! Now do a solve with the Arnoldi basis + ! Now do a solve with the Power basis ! ~~~~~~~~~~~~~ - call PCAIRSetInverseType(pc, PFLAREINV_ARNOLDI, ierr) - - call VecSet(x, 0d0, ierr) - call KSPSolve(ksp,b,x,ierr) - call KSPGetConvergedReason(ksp,reason,ierr) - if (reason%v < 0) then - error stop 1 + if (.NOT. no_power) then + call PCAIRSetInverseType(pc, PFLAREINV_POWER, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_power,ierr) + norm_power = norm_power/norm_rhs end if - ! Compute the residual - call MatMult(A,x,u,ierr) - call VecAXPY(u,-1d0,b,ierr) - call VecNorm(u,NORM_2,norm_arnoldi,ierr) - norm_arnoldi = norm_arnoldi/norm_rhs ! ~~~~~~~~~~~~~ ! Now do a solve with the Newton basis @@ -155,13 +162,15 @@ program main print *, "Arnoldi basis residual: ", norm_arnoldi error stop 1 end if - norm_diff_two = abs(norm_arnoldi - norm_power)/norm_arnoldi - if (norm_diff_two > 1e-9) then - print *, "Residuals differ between polynomial bases!", norm_diff_two - print *, "Power basis residual: ", norm_power - print *, "Arnoldi basis residual: ", norm_arnoldi - error stop 1 - end if + if (.NOT. 
no_power) then + norm_diff_two = abs(norm_arnoldi - norm_power)/norm_arnoldi + if (norm_diff_two > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_two + print *, "Power basis residual: ", norm_power + print *, "Arnoldi basis residual: ", norm_arnoldi + error stop 1 + end if + end if call VecDestroy(b,ierr) call VecDestroy(x,ierr) From dd15fcbcda2455567c0e6ecdb4d15696d0a6b21e Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 22:07:51 +0000 Subject: [PATCH 37/41] Update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 13a5cff..2719d0d 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ PCPFLAREINV contains methods for computing approximate inverses, most of which c | ------------- | -- | ------------- | -- | | power | PFLAREINV_POWER | GMRES polynomial, applied as a mononomial, with coefficients computed with a power basis | Yes | | arnoldi | PFLAREINV_ARNOLDI | GMRES polynomial, applied as a mononomial, with coefficients computed with an Arnoldi method | Yes | - | newton | PFLAREINV_NEWTON | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with extra roots added for stability | Yes | - | newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Yes | + | newton | PFLAREINV_NEWTON | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with extra roots added for stability | Matrix-free: Yes Assembled: No | + | newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Matrix-free: Yes Assembled: No | | neumann | PFLAREINV_NEUMANN | Neumann polynomial | Yes | | sai | PFLAREINV_SAI | Sparse approximate inverse | No | | isai | PFLAREINV_ISAI | Incomplete sparse approximate inverse (equivalent to a one-level RAS) | No | From 85a3ab9787c2a7b09cecfe84d965bf62c4a200ea Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 22:09:08 +0000 Subject: [PATCH 38/41] Some of the new tests exposed a bug in the Kokkos GMRES polynomial implementation (unconnected to the Newton form). 
If using higher-order fixed sparsity, the matching during the fixed sparsity matrix-matrix product was not correct for non-local columns of local rows --- src/Gmres_Polyk.kokkos.cxx | 142 +++++++++++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 31 deletions(-) diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index 071e646..a1279ee 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -67,14 +67,18 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in Mat *submatrices; // Pull out the nonlocal parts of the input mat we need + const PetscInt *colmap_input_mat; + PetscInt cols_ao_input = 0; if (mpi) { - PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local_input, &mat_nonlocal_input, NULL)); + PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local_input, &mat_nonlocal_input, &colmap_input_mat)); PetscCallVoid(MatMPIAIJGetSeqAIJ(*mat_sparsity_match, &mat_local_sparsity, &mat_nonlocal_sparsity, &colmap_mat_sparsity_match)); PetscCallVoid(MatGetSize(mat_nonlocal_sparsity, &rows_ao, &cols_ao)); PetscCallVoid(MatGetSize(mat_local_sparsity, &rows_ad, &cols_ad)); - + PetscInt rows_ao_input; + PetscCallVoid(MatGetSize(mat_nonlocal_input, &rows_ao_input, &cols_ao_input)); + // We need to pull out all the columns in the sparsity mat // and the nonlocal rows that correspond to the nonlocal columns // from the input mat @@ -175,6 +179,71 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, &device_local_vals_output, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_output, &device_nonlocal_j_output, &device_nonlocal_vals_output, &mtype)); + // ~~~~~~~~~~~~~~ + // Build a mapping from the input matrix's nonlocal column indices to the + // sparsity matrix's column space ("local" submat column space), which is defined as: + // [0..cols_ad-1] for local columns, [cols_ad..cols_ad+cols_ao-1] for sparsity colmap columns + // + // When doing the matrix-matrix product: + // 1. We need to compare local cols from local rows + // We need to access the local input matrix and we + // can do that directly given local indices are the same + // + // 2. We need to compare nonlocal cols from non-local rows + // We need to access submat for this which now only has the non-local rows in it + // Those will have a "local" column index that matches col_indices_off_proc_array given + // we create it with MatCreateSubMatrices + // + // 3. We need to compare nonlocal cols from local rows + // We need to access the input_matrix for this + // But (for higher order fixed sparsity) the colmap of the input matrix is not the same + // as the colmap of the sparsity matrix + // So below we create a mapping that converts from the input matrix's nonlocal column indices + // to the "local" column indices of the submat (which correspond to the sparsity matrix's column space) for the nonlocal columns + // If there are not matching entries in the sparsity colmap, we use a large sentinel value that will never + // match any col_orig and preserves sorted order. + // + // This mapping is needed because the input matrix and sparsity matrix may have + // different colmaps when poly_sparsity_order >= 2. 
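+    // As a concrete, purely illustrative example (hypothetical values): if cols_ad = 4,
+    // colmap_input_mat = {10, 42, 97} and colmap_mat_sparsity_match = {10, 42, 57, 97},
+    // then the mapping built below is {4, 5, 7}; any input column with no match in the
+    // sparsity colmap would map to the COLMAP_NOT_FOUND sentinel and be skipped later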
+ // ~~~~~~~~~~~~~~ + + // Use a sentinel larger than any valid column index + const PetscInt COLMAP_NOT_FOUND = cols_ad + cols_ao + 1; + + auto input_nonlocal_to_submat_col_d = PetscIntKokkosView("input_nonlocal_to_submat_col_d", mpi ? cols_ao_input : 1); + if (mpi && cols_ao_input > 0) + { + // Build the mapping on the host + // Both colmaps are sorted, so we can do a merge-style scan + auto input_nonlocal_to_submat_col_h = Kokkos::create_mirror_view(input_nonlocal_to_submat_col_d); + PetscInt sparsity_colmap_idx = 0; + for (PetscInt k = 0; k < cols_ao_input; k++) + { + PetscInt global_col = colmap_input_mat[k]; + // Advance the sparsity colmap index (both are sorted) + while (sparsity_colmap_idx < cols_ao && colmap_mat_sparsity_match[sparsity_colmap_idx] < global_col) + { + sparsity_colmap_idx++; + } + if (sparsity_colmap_idx < cols_ao && colmap_mat_sparsity_match[sparsity_colmap_idx] == global_col) + { + input_nonlocal_to_submat_col_h(k) = cols_ad + sparsity_colmap_idx; + } + else + { + // Not found — use sentinel value that preserves sort order + // Since colmap_input is sorted and colmap_sparsity is sorted, + // if an entry is missing it's between two found entries, + // so we assign a value that maintains monotonicity + input_nonlocal_to_submat_col_h(k) = COLMAP_NOT_FOUND; + } + } + Kokkos::deep_copy(input_nonlocal_to_submat_col_d, input_nonlocal_to_submat_col_h); + // Log copy with petsc + bytes = input_nonlocal_to_submat_col_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + } + // ~~~~~~~~~~~~~~ // Find maximum non-zeros per row for sizing scratch memory // ~~~~~~~~~~~~~~ @@ -358,8 +427,11 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in while (idx_col_of_row_i < ncols_row_i && idx_col_of_row_j < ncols_row_of_col_j) { // The col_target is the column we are trying to match in the row of column j - // We always convert it to the "local" indexing as if it were in the columns of the submat, ie - // the column indexing of [local cols; local cols + 0:cols_ao-1] + // We convert everything to the submat "local" column space for comparison, ie + // the column indexing of [0..cols_ad-1 for local cols; cols_ad+k for sparsity colmap[k]] + // When the input matrix and sparsity matrix have different colmaps + // (poly_sparsity_order >= 2), we use the input_nonlocal_to_submat_col_d mapping + // to convert the input matrix's nonlocal column indices to the sparsity colmap space PetscInt col_target; if (row_of_col_j_local) { @@ -369,8 +441,11 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in } else { - // Convert to "local" column index of submat by adding cols_ad - col_target = device_nonlocal_j_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j] + cols_ad; + // This is the case where we need to access non-local columns in local rows of input_matrix + // and hence we need our mapping + // Convert nonlocal column index from input matrix's colmap space + // to the to "local" column index of submat + col_target = input_nonlocal_to_submat_col_d(device_nonlocal_j_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j]); } } else @@ -390,40 +465,45 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in // Convert to "local" column index of submat by adding cols_ad col_orig = device_nonlocal_j_sparsity[device_nonlocal_i_sparsity[i] + (idx_col_of_row_i - local_cols_row_i)] + cols_ad; } - - if (col_orig < 
col_target) { - // Original column is smaller, move to next original column - idx_col_of_row_i++; - } else if (col_orig > col_target) { - // Target column is smaller, move to next target column + + // Skip entries where the input column doesn't exist in the sparsity pattern + if (col_target == COLMAP_NOT_FOUND) { idx_col_of_row_j++; - // We've found a matching index and hence we can do our compute } else { - - PetscReal val_target; - if (row_of_col_j_local) - { - if (idx_col_of_row_j < local_cols_row_of_col_j) + if (col_orig < col_target) { + // Original column is smaller, move to next original column + idx_col_of_row_i++; + } else if (col_orig > col_target) { + // Target column is smaller, move to next target column + idx_col_of_row_j++; + // We've found a matching index and hence we can do our compute + } else { + + PetscReal val_target; + if (row_of_col_j_local) { - val_target = device_local_vals_input[device_local_i_input[row_of_col_j] + idx_col_of_row_j]; + if (idx_col_of_row_j < local_cols_row_of_col_j) + { + val_target = device_local_vals_input[device_local_i_input[row_of_col_j] + idx_col_of_row_j]; + } + else + { + val_target = device_nonlocal_vals_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j]; + } } else { - val_target = device_nonlocal_vals_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j]; + val_target = device_submat_vals[device_submat_i[row_of_col_j] + idx_col_of_row_j]; } - } - else - { - val_target = device_submat_vals[device_submat_i[row_of_col_j] + idx_col_of_row_j]; - } - // Has to be atomic! Potentially lots of contention so maybe not - // the most performant way to do this - Kokkos::atomic_add(&vals_temp[idx_col_of_row_i], vals_prev[j] * val_target); + // Has to be atomic! 
Potentially lots of contention so maybe not + // the most performant way to do this + Kokkos::atomic_add(&vals_temp[idx_col_of_row_i], vals_prev[j] * val_target); - // Move forward in both arrays - idx_col_of_row_i++; - idx_col_of_row_j++; + // Move forward in both arrays + idx_col_of_row_i++; + idx_col_of_row_j++; + } } } }); From 9ddf8ee7f43347147cbbb46061d24a543363a708 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 22:22:18 +0000 Subject: [PATCH 39/41] Iteration count for test CI --- tests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Makefile b/tests/Makefile index 947b1fd..51f7921 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -662,7 +662,7 @@ run_tests_no_load_parallel: @set -e; for order in 0 1 2 3 4 5 6; do \ echo "--- Testing order = $$order ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ - -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 6; \ done @echo "Test Newton AIRG on advection for for different orders and fixed sparsity in parallel" @set -e; for order in 2 3 4 5 6; do \ From d8f00222cab6bb079af3ea6a7eaa52ed04177754 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 10 Feb 2026 23:53:41 +0000 Subject: [PATCH 40/41] Test that both matrix-free smoothing and assembled (to compute the restrictor) work with Newton form --- tests/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/Makefile b/tests/Makefile index 51f7921..f86f49c 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -156,6 +156,9 @@ run_tests_load_serial: ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 5 @echo "Test Newton GMRES polynomials order 120 with fixed sparsity with added roots in market matrix problem 1138" ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -ksp_norm_type unpreconditioned -ksp_max_it 5 +# + @echo "Test Newton AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing" + ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -212,6 +215,9 @@ run_tests_load_parallel: $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ done; \ done +# + @echo "Test Newton AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ From eb88540dcdba7eeb754f701be0c5f3e39e1d9ebf Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 11 Feb 2026 00:08:04 +0000 Subject: [PATCH 41/41] Enable diagonal detection for Newton GMRES polynomial --- src/AIR_MG_Setup.F90 | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index f35bdfa..5893e45 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -426,8 +426,6 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) ! Convert Aff to a matdiagonal type ! Haven't rewritten some inverse types to take advantage of matdiagonal if (aff_diag .AND. 
& - inverse_type_aff /= PFLAREINV_NEWTON .AND. & - inverse_type_aff /= PFLAREINV_NEWTON_NO_EXTRA .AND. & inverse_type_aff /= PFLAREINV_SAI .AND. & inverse_type_aff /= PFLAREINV_ISAI) then
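
A quick way to exercise both code paths enabled by this series, mirroring the test targets added above (illustrative invocations only; the grid sizes, matrix files and iteration counts are simply the ones those tests use):

  # Assembled Newton-basis GMRES polynomial with fixed sparsity
  ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 -ksp_norm_type unpreconditioned -ksp_max_it 6

  # Matrix-free Newton-basis polynomial smoothing on the hyperbolic streaming problem
  ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5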