diff --git a/Makefile b/Makefile index 3fcd41ae..87f39261 100644 --- a/Makefile +++ b/Makefile @@ -138,7 +138,7 @@ OBJS := $(OBJS) $(SRCDIR)/PETSc_Helper.o \ $(SRCDIR)/PCPFLAREINV.o # Define a variable containing all the tests -export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset +export TEST_TARGETS = ex12f ex6f ex6f_getcoeffs ex6 adv_1d adv_diff_2d ex6_cf_splitting adv_diff_cg_supg matrandom matrandom_check_reset ex12f_gmres_poly mat_diag # Include kokkos examples ifeq ($(PETSC_HAVE_KOKKOS),1) export TEST_TARGETS := $(TEST_TARGETS) adv_1dk diff --git a/README.md b/README.md index 13a5cff8..2719d0df 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ PCPFLAREINV contains methods for computing approximate inverses, most of which c | ------------- | -- | ------------- | -- | | power | PFLAREINV_POWER | GMRES polynomial, applied as a mononomial, with coefficients computed with a power basis | Yes | | arnoldi | PFLAREINV_ARNOLDI | GMRES polynomial, applied as a mononomial, with coefficients computed with an Arnoldi method | Yes | - | newton | PFLAREINV_NEWTON | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with extra roots added for stability | Yes | - | newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Yes | + | newton | PFLAREINV_NEWTON | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with extra roots added for stability | Matrix-free: Yes Assembled: No | + | newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Matrix-free: Yes Assembled: No | | neumann | PFLAREINV_NEUMANN | Neumann polynomial | Yes | | sai | PFLAREINV_SAI | Sparse approximate inverse | No | | isai | PFLAREINV_ISAI | Incomplete sparse approximate inverse (equivalent to a one-level RAS) | No | diff --git a/dockerfiles/Dockerfile_intel b/dockerfiles/Dockerfile_intel index ea51ed6b..b7a2ae3c 100644 --- a/dockerfiles/Dockerfile_intel +++ b/dockerfiles/Dockerfile_intel @@ -28,6 +28,9 @@ RUN source /opt/intel/oneapi/setvars.sh && \ sed -i '/^run_tests_load_parallel:/,/^run_tests_no_load_serial:/s/-pc_pflareinv_type power/-pc_pflareinv_type arnoldi/g' tests/Makefile && \ sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/-pc_air_inverse_type power/-pc_air_inverse_type arnoldi/g' tests/Makefile && \ sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/-pc_pflareinv_type power/-pc_pflareinv_type arnoldi/g' tests/Makefile && \ + echo "Disabling power basis in parallel test ex12f_gmres_poly in tests/Makefile" && \ + sed -i '/^run_tests_load_parallel:/,/^run_tests_no_load_serial:/s/ex12f_gmres_poly/ex12f_gmres_poly -no_power/g' tests/Makefile && \ + sed -i '/^run_tests_no_load_parallel:/,/^run_tests_medium_serial:/s/ex12f_gmres_poly/ex12f_gmres_poly -no_power/g' tests/Makefile && \ make -j2 && make -j2 check && \ make -j2 tests diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 180d0f0d..5893e453 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -424,7 +424,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) end if ! Convert Aff to a matdiagonal type - ! Haven't rewritten sai to take advantage of matdiagonal + ! 
Haven't rewritten some inverse types to take advantage of matdiagonal if (aff_diag .AND. & inverse_type_aff /= PFLAREINV_SAI .AND. & inverse_type_aff /= PFLAREINV_ISAI) then diff --git a/src/Approx_Inverse_Setup.F90 b/src/Approx_Inverse_Setup.F90 index 35559cab..293f4e03 100644 --- a/src/Approx_Inverse_Setup.F90 +++ b/src/Approx_Inverse_Setup.F90 @@ -70,7 +70,12 @@ subroutine calculate_and_build_approximate_inverse(matrix, inverse_type, & allocate(coefficients(poly_order + 1, 1)) end if else - coefficients => coefficients_stack + if (inverse_type == PFLAREINV_NEWTON .OR. inverse_type == PFLAREINV_NEWTON_NO_EXTRA) then + ! Newton basis needs storage for real and imaginary roots + allocate(coefficients(poly_order + 1, 2)) + else + coefficients => coefficients_stack + end if end if ! This is diabolical - In petsc 3.22, they changed the way to test for @@ -303,13 +308,9 @@ subroutine finish_approximate_inverse(matrix, inverse_type, & ! Gmres polynomial with newton basis else if (inverse_type == PFLAREINV_NEWTON .OR. inverse_type == PFLAREINV_NEWTON_NO_EXTRA) then - if (.NOT. matrix_free) then - print *, "GMRES polynomial with Newton basis must be applied matrix-free" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - call build_gmres_polynomial_newton_inverse(matrix, poly_order, & coefficients, & + inverse_sparsity_order, matrix_free, reuse_mat, reuse_submatrices, & inv_matrix) ! Neumann polynomial diff --git a/src/Gmres_Poly.F90 b/src/Gmres_Poly.F90 index 552f77ac..309b42e3 100644 --- a/src/Gmres_Poly.F90 +++ b/src/Gmres_Poly.F90 @@ -851,7 +851,6 @@ subroutine mat_mult_powers_share_sparsity_cpu(matrix, poly_order, poly_sparsity_ ! Compute matrix powers c = coeff(1) * I + coeff(2) * A + coeff(3) * A^2 + coeff(4) * A^3 + ... ! where a c and the powers all share the same sparsity as the power input in poly_sparsity_order - ! Assuming cmat has not been built/allocated ! This also finishes the async comms required to compute the gmres poly coefficients if buffers%request is allocated ! ~~~~~~~~~~ @@ -1655,7 +1654,7 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, integer :: order PetscErrorCode :: ierr logical :: reuse_triggered - type(tVec) :: rhs_copy, diag_vec, power_vec + type(tVec) :: inv_vec, diag_vec, power_vec ! ~~~~~~ reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) @@ -1666,9 +1665,9 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, ! This stores D^order if (.NOT. reuse_triggered) then - call VecDuplicate(diag_vec, rhs_copy, ierr) + call VecDuplicate(diag_vec, inv_vec, ierr) else - call MatDiagonalGetDiagonal(inv_matrix, rhs_copy, ierr) + call MatDiagonalGetDiagonal(inv_matrix, inv_vec, ierr) end if call VecCopy(diag_vec, power_vec, ierr) @@ -1678,22 +1677,22 @@ subroutine build_gmres_polynomial_inverse_0th_order_sparsity(matrix, poly_order, call finish_gmres_polynomial_coefficients_power(poly_order, buffers, coefficients) ! Set: alpha_0 * I - call VecSet(rhs_copy, coefficients(1), ierr) + call VecSet(inv_vec, coefficients(1), ierr) ! Calculate: alpha_0 * I + alpha_1 * D + alpha_2 * D^2 do order = 1, poly_order - call VecAXPY(rhs_copy, coefficients(order+1), power_vec, ierr) + call VecAXPY(inv_vec, coefficients(order+1), power_vec, ierr) ! Compute power_vec = power_vec * D if (order /= poly_order) call VecPointwiseMult(power_vec, power_vec, diag_vec, ierr) end do ! We may be reusing with the same sparsity if (.NOT. reuse_triggered) then - ! 
The matrix takes ownership of rhs_copy and increases ref counter - call MatCreateDiagonal(rhs_copy, inv_matrix, ierr) - call VecDestroy(rhs_copy, ierr) + ! The matrix takes ownership of inv_vec and increases ref counter + call MatCreateDiagonal(inv_vec, inv_matrix, ierr) + call VecDestroy(inv_vec, ierr) else - call MatDiagonalRestoreDiagonal(inv_matrix, rhs_copy, ierr) + call MatDiagonalRestoreDiagonal(inv_matrix, inv_vec, ierr) end if call VecDestroy(diag_vec, ierr) diff --git a/src/Gmres_Poly_Newton.F90 b/src/Gmres_Poly_Newton.F90 index 87e5f8da..1761c901 100644 --- a/src/Gmres_Poly_Newton.F90 +++ b/src/Gmres_Poly_Newton.F90 @@ -135,6 +135,240 @@ subroutine modified_leja(real_roots, imag_roots, indices) end subroutine modified_leja + ! ------------------------------------------------------------------------------------------------- + + subroutine cluster_eigenvalues_stable(real_roots, imag_roots, rel_tol, abs_tol) + + ! Clustering of (possibly complex) harmonic Ritz values. + ! Numerically distinct clusters are moved to the front. + ! Remaining entries are set to zero. + ! Skips eigenvalues that are exactly zero (both real and imag parts). + ! + ! Inputs: + ! real_roots, imag_roots : eigenvalues (length k) + ! rel_tol : relative tolerance (suggest sqrt(eps) ~ 1e-8) + ! abs_tol : absolute tolerance (suggest eps * ||H|| ~ 1e-15) + ! + ! Outputs: + ! real_roots, imag_roots : clustered eigenvalues at front, zeros after + + PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots + PetscReal, intent(in) :: rel_tol, abs_tol + integer :: i, j, n, n_unique, cluster_size + logical, allocatable :: used(:) + PetscReal :: dist, mag_i, mag_j, scale + PetscReal :: sum_real, sum_imag + PetscReal, allocatable :: rtmp(:), itmp(:) + + n = size(real_roots) + + allocate(used(n)) + allocate(rtmp(n), itmp(n)) + + used = .false. + n_unique = 0 + + ! --------------------------------------------------------- + ! All-pairs clustering + ! --------------------------------------------------------- + do i = 1, n + + if (used(i)) cycle + + ! Skip eigenvalues that are exactly zero + if (real_roots(i) == 0.0d0 .AND. imag_roots(i) == 0.0d0) then + used(i) = .true. + cycle + end if + + ! Start new cluster with eigenvalue i + sum_real = real_roots(i) + sum_imag = imag_roots(i) + cluster_size = 1 + used(i) = .true. + + mag_i = sqrt(real_roots(i)**2 + imag_roots(i)**2) + + ! Look for all other eigenvalues close to this one + do j = i + 1, n + + if (used(j)) cycle + + ! Skip exactly zero eigenvalues + if (real_roots(j) == 0.0d0 .AND. imag_roots(j) == 0.0d0) then + used(j) = .true. + cycle + end if + + mag_j = sqrt(real_roots(j)**2 + imag_roots(j)**2) + + ! Distance between eigenvalues + dist = sqrt((real_roots(j) - real_roots(i))**2 + & + (imag_roots(j) - imag_roots(i))**2) + + ! Use the larger magnitude for relative scaling + scale = max(mag_i, mag_j, 1.0d0) + + ! Check if within tolerance + if (dist <= abs_tol + rel_tol * scale) then + sum_real = sum_real + real_roots(j) + sum_imag = sum_imag + imag_roots(j) + cluster_size = cluster_size + 1 + used(j) = .true. + end if + + end do + + ! Compute cluster centroid (mean) + n_unique = n_unique + 1 + rtmp(n_unique) = sum_real / dble(cluster_size) + itmp(n_unique) = sum_imag / dble(cluster_size) + + end do + + ! --------------------------------------------------------- + ! Output compact form + ! 
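+      ! As an illustrative example (hypothetical values, not from a run):
+      ! with rel_tol = 1e-8 the inputs (2.0, 2.0 + 1e-9, 0.0, 5.0) compact
+      ! below to (2.0 + 5e-10, 5.0, 0.0, 0.0) - the two nearby values merge
+      ! into their centroid, exact zeros are skipped, and zeros pad the tail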
+      ! ---------------------------------------------------------
+      real_roots = 0.0d0
+      imag_roots = 0.0d0
+
+      real_roots(1:n_unique) = rtmp(1:n_unique)
+      imag_roots(1:n_unique) = itmp(1:n_unique)
+
+      deallocate(used, rtmp, itmp)
+
+   end subroutine cluster_eigenvalues_stable
+
+
+   ! -------------------------------------------------------------------------------------------------
+
+   subroutine compute_extra_roots(real_roots, imag_roots, real_roots_output, imag_roots_output)
+
+      ! Add extra roots for stability
+      ! Computes the product of factors for each eigenvalue and adds extra copies
+      ! of roots that have large products (to improve polynomial stability)
+      ! Only non-zero eigenvalues should be passed in
+      ! real_roots_output, imag_roots_output are allocated and filled with the original
+      ! roots plus any extra
+
+      ! ~~~~~~
+      PetscReal, dimension(:), intent(inout) :: real_roots, imag_roots
+      PetscReal, dimension(:), allocatable, intent(inout) :: real_roots_output, imag_roots_output
+
+      ! Local variables
+      integer :: i_loc, j_loc, k_loc, n_roots, total_extra, counter
+      PetscReal :: a, b, c, d, div_real, div_imag, div_mag
+      PetscReal, dimension(size(real_roots)) :: pof
+      integer, dimension(size(real_roots)) :: extra_pair_roots, overflow
+
+      ! ~~~~~~
+
+      n_roots = size(real_roots)
+
+      ! Compute the product of factors
+      pof = 1
+      extra_pair_roots = 0
+      overflow = 0
+      total_extra = 0
+      do k_loc = 1, n_roots
+
+         a = real_roots(k_loc)
+         b = imag_roots(k_loc)
+
+         ! We have already computed pof for the positive imaginary complex conjugate
+         if (b < 0) cycle
+
+         ! Skips eigenvalues that are numerically zero
+         if (abs(a) < 1e-12) cycle
+         if (a**2 + b**2 < 1e-12) cycle
+
+         ! Compute pof(k) = prod_{i /= k} |1 - theta_k/theta_i|
+         do i_loc = 1, n_roots
+
+            ! Skip
+            if (k_loc == i_loc) cycle
+
+            c = real_roots(i_loc)
+            d = imag_roots(i_loc)
+
+            ! Skips eigenvalues that are numerically zero
+            if (abs(c) < 1e-12) cycle
+            if (c**2 + d**2 < 1e-12) cycle
+
+            ! theta_k/theta_i
+            div_real = (a * c + b * d)/(c**2 + d**2)
+            div_imag = (b * c - a * d)/(c**2 + d**2)
+
+            ! |1 - theta_k/theta_i|
+            div_mag = sqrt((1 - div_real)**2 + div_imag**2)
+
+            ! pof is about to overflow, so store the exponent and
+            ! reset pof back to one
+            ! We can hit this for very high order polynomials, where we have to
+            ! add more roots than 22 (ie pof > 1e308)
+            if (log10(pof(k_loc)) + log10(div_mag) > 307) then
+               overflow(k_loc) = overflow(k_loc) + int(log10(pof(k_loc)))
+               pof(k_loc) = 1
+            end if
+
+            ! Product
+            pof(k_loc) = pof(k_loc) * div_mag
+
+         end do
+
+         ! If pof > 1e4, we add an extra root, plus one extra for every 1e14
+         if (log10(pof(k_loc)) > 4 .OR. overflow(k_loc) /= 0) then
+
+            ! If real, extra_pair_roots counts each distinct real root we're adding
+            ! If imaginary, it only counts a pair as one
+            extra_pair_roots(k_loc) = ceiling((log10(pof(k_loc)) + overflow(k_loc) - 4.0)/14.0)
+            total_extra = total_extra + extra_pair_roots(k_loc)
+
+            ! If imaginary, the pof is the same for the conjugate, so let's just set it to -1
+            if (b > 0) then
+               ! We know the positive imaginary value is first, so the conjugate follows it
+               pof(k_loc+1) = -1
+               ! We need the conjugates as well
+               total_extra = total_extra + extra_pair_roots(k_loc)
+
+            end if
+         end if
+      end do
+
+      ! Allocate output arrays (original roots + extra roots)
+      allocate(real_roots_output(n_roots + total_extra))
+      allocate(imag_roots_output(n_roots + total_extra))
+      real_roots_output = 0d0
+      imag_roots_output = 0d0
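+      ! As an illustrative example of the sizing above (hypothetical values):
+      ! a real root with pof = 1e20 gives extra_pair_roots = ceiling((20 - 4)/14) = 2,
+      ! while a complex conjugate pair with the same pof contributes 2 extra
+      ! copies of the root plus 2 of its conjugate, so total_extra grows by 4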
+      ! Copy in original roots
+      real_roots_output(1:n_roots) = real_roots(1:n_roots)
+      imag_roots_output(1:n_roots) = imag_roots(1:n_roots)
+
+      ! Add the extra copies of roots, ensuring conjugate pairs we add
+      ! are next to each other
+      counter = n_roots + 1
+      do i_loc = 1, n_roots
+
+         ! For each extra root pair to add
+         do j_loc = 1, extra_pair_roots(i_loc)
+
+            real_roots_output(counter) = real_roots(i_loc)
+            imag_roots_output(counter) = imag_roots(i_loc)
+            ! Add in the conjugate
+            if (imag_roots(i_loc) > 0) then
+               real_roots_output(counter+1) = real_roots(i_loc)
+               imag_roots_output(counter+1) = -imag_roots(i_loc)
+            end if
+
+            counter = counter + 1
+            if (imag_roots(i_loc) > 0) counter = counter + 1
+         end do
+      end do
+
+   end subroutine compute_extra_roots
+
+
 ! -------------------------------------------------------------------------------------------------------------------------------
 
    subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots, coefficients)
@@ -160,22 +394,22 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots
 
       ! Local variables
       PetscInt :: global_rows, global_cols, local_rows, local_cols
       integer :: lwork, subspace_size, rank, i_loc, comm_size, comm_rank, errorcode, iwork_size, j_loc
-      integer :: total_extra, counter, k_loc, m
+      integer :: total_extra, counter, k_loc, m, numerical_order
       PetscErrorCode :: ierr
      MPIU_Comm :: MPI_COMM_MATRIX
       PetscReal, dimension(poly_order+2,poly_order+1) :: H_n
       PetscReal, dimension(poly_order+1,poly_order+2) :: H_n_T
-      PetscReal, dimension(poly_order+1) :: e_d, solution, s, pof
-      integer, dimension(poly_order+1) :: extra_pair_roots, overflow
+      PetscReal, dimension(poly_order+1) :: e_d, solution, s
       integer, dimension(:), allocatable :: iwork_allocated, indices
-      PetscReal, dimension(:), allocatable :: work
+      PetscReal, dimension(:), allocatable :: work, real_roots_added, imag_roots_added
+      PetscReal, dimension(:), allocatable :: perturbed_real, perturbed_imag
       PetscReal, dimension(:,:), allocatable :: VL, VR
-      PetscReal :: beta, div_real, div_imag, a, b, c, d, div_mag
+      PetscReal :: beta
       PetscReal, dimension(:, :), allocatable :: coefficients_temp
       type(tVec) :: w_j
       type(tVec), dimension(poly_order+2) :: V_n
       logical :: use_harmonic_ritz = .TRUE.
-      PetscReal :: rcond = 1e-12
+      PetscReal :: rcond = 1e-12, rel_tol, abs_tol, H_norm
 
       ! ~~~~~~
 
@@ -236,8 +470,9 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots
       ! What we find is that when use this to compute eigenvalues we find e-vals
       ! as we might expect up to the rank
       ! but then we have some eigenvalues that are numerically zero
-      ! We keep those and our application of the newton polynomial in
-      ! petsc_matvec_gmres_newton_mf and petsc_matvec_gmres_newton_mf_residual
+      ! Given the way the outside code is structured, we can't lower the poly_order
+      ! in this routine and return
+      ! Instead we keep the "zero" eigenvalues, and our application of the newton polynomial
       ! just skips them and hence we don't do any
       ! extra work in the application phase than we would have done with lower order
 
@@ -264,8 +499,8 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots
       call dgelsd(poly_order + 1, poly_order + 1, 1, H_n_T, size(H_n_T, 1), &
                   e_d, size(e_d), s, rcond, rank, &
                   work, lwork, iwork_allocated, errorcode)
-      deallocate(work, iwork_allocated)
-
+      deallocate(work, iwork_allocated)
+
+      ! 
Copy in the solution solution = e_d @@ -310,144 +545,146 @@ subroutine calculate_gmres_polynomial_roots_newton(matrix, poly_order, add_roots end if ! ~~~~~~~~~~~~~~ - ! Add roots for stability - ! ~~~~~~~~~~~~~~ - if (add_roots) then - - ! Compute the product of factors - pof = 1 - extra_pair_roots = 0 - overflow = 0 - total_extra = 0 - do k_loc = 1, poly_order + 1 - - a = coefficients(k_loc, 1) - b = coefficients(k_loc, 2) - - ! We have already computed pof for the positive imaginary complex conjugate - if (b < 0) cycle - - ! Skips eigenvalues that are numerically zero - if (abs(a) < 1e-12) cycle - if (a**2 + b**2 < 1e-12) cycle - - ! Compute product(k)_{i, j/=i} * | 1 - theta_j/theta_i| - do i_loc = 1, poly_order + 1 - - ! Skip - if (k_loc == i_loc) cycle - - c = coefficients(i_loc, 1) - d = coefficients(i_loc, 2) + ! Now we have to check the output eigenvalues + ! ~~~~~~~~~~~~~~ - ! Skips eigenvalues that are numerically zero - if (abs(c) < 1e-12) cycle - if (c**2 + d**2 < 1e-12) cycle + ! These are the tolerances that control the clustering + H_norm = norm2(H_n(1:m,1:m)) + rel_tol = 1.0d0 * sqrt(epsilon(1.0d0)) + abs_tol = epsilon(1.0d0) * max(H_norm, beta) + + ! In some cases with numerical rank deficiency, we can still + ! end up with non-zero (or negative) eigenvalues that + ! are trivially small - we set them explicitly to zero + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc,1)**2 + coefficients(i_loc,2)**2 < & + (abs_tol + rel_tol*H_norm)**2) then + coefficients(i_loc, 1) = 0d0 + coefficients(i_loc, 2) = 0d0 + end if + end do + + ! ~~~~~~~~~~~~~~ + ! Cluster close eigenvalues together to improve stability of the polynomial evaluation + ! For example when computing the e'vals of a constant diagonal matrix + ! the rank revealing factorisation above doesn't always report a rank of 1 given roundoff + ! Instead it returns multiple eigenvalues that are very close to each other, + ! and we want to cluster those together and treat them as one root + ! ~~~~~~~~~~~~~~ - ! theta_k/theta_i - div_real = (a * c + b * d)/(c**2 + d**2) - div_imag = (b * c - a * d)/(c**2 + d**2) + ! This places all exactly zero eigenvalues at the end of coefficients + call cluster_eigenvalues_stable(coefficients(:, 1), coefficients(:, 2), rel_tol, abs_tol) - ! |1 - theta_k/theta_i| - div_mag = sqrt((1 - div_real)**2 + div_imag**2) + ! ~~~~~~~~~~~~~~ + ! Extract the non-zero eigenvalues for root adding and leja ordering + ! Zero eigenvalues will be appended at the end + ! ~~~~~~~~~~~~~~ + ! Count the number of non-zero eigenvalues after clustering + numerical_order = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then + numerical_order = numerical_order + 1 + end if + end do - ! Pof is about to overflow, store the exponent and - ! reset pof back to one - ! We can hit this for very high order polynomials, where we have to - ! add more roots than 22 (ie pof > 1e308) - if (log10(pof(k_loc)) + log10(div_mag) > 307) then - overflow(k_loc) = overflow(k_loc) + int(log10(pof(k_loc))) - pof(k_loc) = 1 - end if + ! ~~~~~~~~~~~~~~ + ! Add roots for stability (only on non-zero eigenvalues) + ! ~~~~~~~~~~~~~~ + if (add_roots .AND. numerical_order > 0) then - ! Product - pof(k_loc) = pof(k_loc) * div_mag + ! Extract non-zero eigenvalues into a temporary array + allocate(coefficients_temp(numerical_order, 2)) + counter = 0 + do i_loc = 1, poly_order + 1 + if (coefficients(i_loc, 1) /= 0d0 .OR. 
coefficients(i_loc, 2) /= 0d0) then + counter = counter + 1 + coefficients_temp(counter, 1) = coefficients(i_loc, 1) + coefficients_temp(counter, 2) = coefficients(i_loc, 2) + end if + end do + ! Call compute_extra_roots only on the non-zero eigenvalues + ! This allocates real_roots_added/imag_roots_added with the original + extra roots + call compute_extra_roots(coefficients_temp(:, 1), coefficients_temp(:, 2), & + real_roots_added, imag_roots_added) + + ! total number of non-zero roots after adding extras + total_extra = size(real_roots_added) - numerical_order + + ! Resize coefficients to hold non-zero roots (with extras) + zero roots at end + deallocate(coefficients) + allocate(coefficients(size(real_roots_added) + (poly_order + 1 - numerical_order), 2)) + coefficients = 0d0 + + ! Create perturbed copy for leja ordering + allocate(perturbed_real(size(real_roots_added))) + allocate(perturbed_imag(size(real_roots_added))) + perturbed_real = real_roots_added + perturbed_imag = imag_roots_added + + ! Perturb the extra roots so they have unique values for the leja sort + counter = numerical_order + 1 + do i_loc = 1, numerical_order + k_loc = 0 + do j_loc = counter, size(real_roots_added) + ! Check if this extra root matches the original + if (real_roots_added(j_loc) == coefficients_temp(i_loc, 1) .AND. & + abs(imag_roots_added(j_loc)) == abs(coefficients_temp(i_loc, 2))) then + k_loc = k_loc + 1 + perturbed_real(j_loc) = real_roots_added(j_loc) + k_loc * 5e-8 + end if end do + end do - ! If pof > 1e4, we add an extra root, plus one extra for every 1e14 - if (log10(pof(k_loc)) > 4 .OR. overflow(k_loc) /= 0) then + ! Leja order only the non-zero eigenvalues (with extras) + call modified_leja(perturbed_real, perturbed_imag, indices) - ! if real extra_pair_roots counts each distinct real root we're adding - ! if imaginary it only counts a pair as one - extra_pair_roots(k_loc) = ceiling((log10(pof(k_loc)) + overflow(k_loc) - 4.0)/14.0) - total_extra = total_extra + extra_pair_roots(k_loc) + ! Reorder the (non-perturbed) roots using the leja ordering + coefficients(1:size(real_roots_added), 1) = real_roots_added(indices) + coefficients(1:size(real_roots_added), 2) = imag_roots_added(indices) - ! If imaginary, the pof is the same for the conjugate, let's just set it to -1 - if (b > 0) then - ! We know the positive imaginary value is first, so the conjugate follows it - pof(k_loc+1) = -1 - ! We need the conjugates as well - total_extra = total_extra + extra_pair_roots(k_loc) + ! Zero eigenvalues are already zero at the end from the coefficients = 0d0 above - end if - end if - end do + ! Cleanup + deallocate(coefficients_temp, real_roots_added, imag_roots_added) + deallocate(perturbed_real, perturbed_imag, indices) - ! If we have extra roots we need to resize the coefficients storage - if (total_extra > 0) then - allocate(coefficients_temp(size(coefficients, 1), size(coefficients, 2))) - coefficients_temp(1:size(coefficients, 1), 1:size(coefficients, 2)) = coefficients - deallocate(coefficients) - allocate(coefficients(size(coefficients_temp, 1) + total_extra, 2)) - coefficients = 0 - coefficients(1:size(coefficients_temp, 1), :) = coefficients_temp - deallocate(coefficients_temp) - end if - end if + else - ! Take a copy of the existing roots - coefficients_temp = coefficients + ! No root adding - just leja order the non-zero eigenvalues + ! and put zeros at the end + if (numerical_order > 0) then - if (add_roots) then - - ! Add the extra copies of roots, ensuring conjugate pairs we add - ! 
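+            ! As an illustrative example (hypothetical values): if clustering
+            ! left 6 non-zero roots and 2 zeros, numerical_order = 6, only
+            ! those 6 are passed to modified_leja below and the 2 zeros stay
+            ! in positions 7-8 of coefficients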
-               ! are next to each other
-         counter = size(extra_pair_roots)+1
-         do i_loc = 1, size(extra_pair_roots)
-
-            ! For each extra root pair to add
-            do j_loc = 1, extra_pair_roots(i_loc)
-
-               coefficients(counter, :) = coefficients(i_loc, :)
-               ! Add in the conjugate
-               if (coefficients(i_loc, 2) > 0) then
-                  coefficients(counter+1, 1) = coefficients(i_loc, 1)
-                  coefficients(counter+1, 2) = -coefficients(i_loc, 2)
+            ! Extract non-zero eigenvalues
+            allocate(coefficients_temp(numerical_order, 2))
+            counter = 0
+            do i_loc = 1, poly_order + 1
+               if (coefficients(i_loc, 1) /= 0d0 .OR. coefficients(i_loc, 2) /= 0d0) then
+                  counter = counter + 1
+                  coefficients_temp(counter, 1) = coefficients(i_loc, 1)
+                  coefficients_temp(counter, 2) = coefficients(i_loc, 2)
                end if
+            end do
 
-               ! Store a perturbed root so we have unique values for the leja sort below
-               ! Just peturbing the real value
-               coefficients_temp(counter, 1) = coefficients(i_loc, 1) + j_loc * 5e-8
-               coefficients_temp(counter, 2) = coefficients(i_loc, 2)
-               ! Add in the conjugate
-               if (coefficients(i_loc, 2) > 0) then
-                  coefficients_temp(counter+1, 1) = coefficients(i_loc, 1) + j_loc * 5e-8
-                  coefficients_temp(counter+1, 2) = -coefficients(i_loc, 2)
-               end if
+            ! Leja order the non-zero eigenvalues
+            call modified_leja(coefficients_temp(:, 1), coefficients_temp(:, 2), indices)
 
-               counter = counter + 1
-               if (coefficients(i_loc, 2) > 0) counter = counter + 1
-            end do
-         end do
-      end if
+            ! Reorder and put zeros at the end
+            coefficients = 0d0
+            coefficients(1:numerical_order, 1) = coefficients_temp(indices, 1)
+            coefficients(1:numerical_order, 2) = coefficients_temp(indices, 2)
 
-      ! ~~~~~~~~~~~~~~
-      ! Now compute a modified leja ordering for stability
-      ! ~~~~~~~~~~~~~~
-      ! Called with the peturbed extra roots
-      call modified_leja(coefficients_temp(:,1), coefficients_temp(:,2), indices)
+            deallocate(coefficients_temp, indices)
 
-      ! Reorder the (non-peturbed) roots
-      coefficients(:,1) = coefficients(indices,1)
-      coefficients(:,2) = coefficients(indices,2)
+         end if
+
+      end if
 
       ! Cleanup
-      deallocate(coefficients_temp)
       do i_loc = 1, subspace_size+1
          call VecDestroy(V_n(i_loc), ierr)
       end do
-      call VecDestroy(w_j, ierr)
+      call VecDestroy(w_j, ierr)
 
    end subroutine calculate_gmres_polynomial_roots_newton
 
@@ -469,7 +706,7 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y)
       type(tVec) :: y
 
       ! Local
-      integer :: order, errorcode
+      integer :: i, errorcode
       PetscErrorCode :: ierr
       type(mat_ctxtype), pointer :: mat_ctx => null()
 
@@ -487,24 +724,24 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y)
       call VecSet(y, 0d0, ierr)
 
       ! ~~~~~~~~~~~~
-      ! Iterate over the order
+      ! Iterate over the roots
       ! ~~~~~~~~~~~~
-      order = 1
-      do while (order .le. size(mat_ctx%real_roots) - 1)
+      i = 1
+      do while (i .le. size(mat_ctx%real_roots) - 1)
 
          ! If real this is easy
-         if (mat_ctx%imag_roots(order) == 0d0) then
+         if (mat_ctx%imag_roots(i) == 0d0) then
 
            ! Skips eigenvalues that are numerically zero - see
            ! the comment in calculate_gmres_polynomial_roots_newton
-            if (abs(mat_ctx%real_roots(order)) < 1e-12) then
-               order = order + 1
+            if (abs(mat_ctx%real_roots(i)) < 1e-12) then
+               i = i + 1
               cycle
            end if
 
            ! y = y + theta_i * MF_VEC_TEMP
            call VecAXPY(y, &
-                  1d0/mat_ctx%real_roots(order), &
+                  1d0/mat_ctx%real_roots(i), &
                  mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr)
 
            ! MF_VEC_DIAG = A * MF_VEC_TEMP
            call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_TEMP), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr)
            ! 
MF_VEC_TEMP = MF_VEC_TEMP - theta_i * MF_VEC_DIAG call VecAXPY(mat_ctx%mf_temp_vec(MF_VEC_TEMP), & - -1d0/mat_ctx%real_roots(order), & + -1d0/mat_ctx%real_roots(i), & mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) - order = order + 1 + i = i + 1 ! If imaginary, then have to combine the e'val and its ! complex conjugate to keep the arithmetic real @@ -523,8 +760,8 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) else ! Skips eigenvalues that are numerically zero - if (mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2 < 1e-12) then - order = order + 2 + if (mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2 < 1e-12) then + i = i + 2 cycle end if @@ -532,27 +769,27 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_TEMP), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) ! MF_VEC_DIAG = 2 * Re(theta_i) * MF_VEC_TEMP - MF_VEC_DIAG call VecAXPBY(mat_ctx%mf_temp_vec(MF_VEC_DIAG), & - 2 * mat_ctx%real_roots(order), & + 2 * mat_ctx%real_roots(i), & -1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) ! y = y + 1/(Re(theta_i)^2 + Imag(theta_i)^2) * MF_VEC_DIAG call VecAXPY(y, & - 1d0/(mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2), & + 1d0/(mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2), & mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) - if (order .le. size(mat_ctx%real_roots) - 2) then + if (i .le. size(mat_ctx%real_roots) - 2) then ! MF_VEC_RHS = A * MF_VEC_DIAG call MatMult(mat_ctx%mat, mat_ctx%mf_temp_vec(MF_VEC_DIAG), mat_ctx%mf_temp_vec(MF_VEC_RHS), ierr) ! MF_VEC_TEMP = MF_VEC_TEMP - 1/(Re(theta_i)^2 + Imag(theta_i)^2) * MF_VEC_RHS call VecAXPY(mat_ctx%mf_temp_vec(MF_VEC_TEMP), & - -1d0/(mat_ctx%real_roots(order)**2 + mat_ctx%imag_roots(order)**2), & + -1d0/(mat_ctx%real_roots(i)**2 + mat_ctx%imag_roots(i)**2), & mat_ctx%mf_temp_vec(MF_VEC_RHS), ierr) end if ! Skip two evals - order = order + 2 + i = i + 2 end if end do @@ -561,11 +798,11 @@ subroutine petsc_matvec_gmres_newton_mf(mat, x, y) if (mat_ctx%imag_roots(size(mat_ctx%real_roots)) == 0d0) then ! Skips eigenvalues that are numerically zero - if (abs(mat_ctx%real_roots(order)) > 1e-12) then + if (abs(mat_ctx%real_roots(size(mat_ctx%real_roots))) > 1e-12) then ! y = y + theta_i * MF_VEC_TEMP call VecAXPBY(y, & - 1d0/mat_ctx%real_roots(order), & + 1d0/mat_ctx%real_roots(size(mat_ctx%real_roots)), & 1d0, & mat_ctx%mf_temp_vec(MF_VEC_TEMP), ierr) end if @@ -669,11 +906,737 @@ subroutine petsc_matvec_gmres_newton_mf_residual(mat, x, y) end do end subroutine petsc_matvec_gmres_newton_mf_residual +!------------------------------------------------------------------------------------------------------------------------ + + subroutine mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsity_order, coefficients, & + reuse_mat, reuse_submatrices, cmat) + + ! Wrapper around mat_mult_powers_share_sparsity_cpu and mat_mult_powers_share_sparsity_kokkos + + ! ~~~~~~~~~~ + ! Input + type(tMat), target, intent(in) :: matrix + integer, intent(in) :: poly_order, poly_sparsity_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: reuse_mat, cmat + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices + +! #if defined(PETSC_HAVE_KOKKOS) +! integer(c_long_long) :: A_array, B_array, reuse_array +! integer :: errorcode, reuse_int_cmat, reuse_int_reuse_mat +! PetscErrorCode :: ierr +! MatType :: mat_type +! Mat :: temp_mat, temp_mat_reuse, temp_mat_compare +! PetscScalar normy; +! 
logical :: reuse_triggered_cmat, reuse_triggered_reuse_mat +! type(c_ptr) :: coefficients_ptr +! type(tMat) :: reuse_mat_cpu +! type(tMat), dimension(:), pointer :: reuse_submatrices_cpu +! #endif + ! ~~~~~~~~~~ + + ! ~~~~~~~~~~ + ! Special case if we just want to return a gmres polynomial with the sparsity of the diagonal + ! This is like a damped Jacobi + ! ~~~~~~~~~~ +if (poly_sparsity_order == 0) then + + call build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly_order, & + coefficients, cmat) + + return +end if + +! #if defined(PETSC_HAVE_KOKKOS) + +! call MatGetType(matrix, mat_type, ierr) +! if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & +! mat_type == MATAIJKOKKOS) then + +! A_array = matrix%v +! reuse_triggered_cmat = .NOT. PetscObjectIsNull(cmat) +! reuse_triggered_reuse_mat = .NOT. PetscObjectIsNull(reuse_mat) +! reuse_int_cmat = 0 +! if (reuse_triggered_cmat) then +! reuse_int_cmat = 1 +! B_array = cmat%v +! end if +! reuse_int_reuse_mat = 0 +! if (reuse_triggered_reuse_mat) then +! reuse_int_reuse_mat = 1 +! end if +! reuse_array = reuse_mat%v +! coefficients_ptr = c_loc(coefficients) + +! ! call mat_mult_powers_share_sparsity_newton_kokkos(A_array, poly_order, poly_sparsity_order, & +! ! coefficients_ptr, reuse_int_reuse_mat, reuse_array, reuse_int_cmat, B_array) + +! reuse_mat%v = reuse_array +! cmat%v = B_array + +! ! If debugging do a comparison between CPU and Kokkos results +! if (kokkos_debug()) then + +! ! If we're doing reuse and debug, then we have to always output the result +! ! from the cpu version, as it will have coo preallocation structures set +! ! They aren't copied over if you do a matcopy (or matconvert) +! ! If we didn't do that the next time we come through this routine +! ! and try to call the cpu version with reuse, it will segfault +! if (reuse_triggered_cmat) then +! temp_mat = cmat +! call MatConvert(cmat, MATSAME, MAT_INITIAL_MATRIX, temp_mat_compare, ierr) +! else +! temp_mat_compare = cmat +! end if + +! ! Debug check if the CPU and Kokkos versions are the same +! ! We send in an empty reuse_mat_cpu here always, as we can't pass through +! ! the same one Kokkos uses as it now only gets out the non-local rows we need +! ! (ie reuse_mat and reuse_mat_cpu are no longer the same size) +! reuse_submatrices_cpu => null() +! call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & +! coefficients, reuse_mat_cpu, reuse_submatrices_cpu, temp_mat) +! call destroy_matrix_reuse(reuse_mat_cpu, reuse_submatrices_cpu) + +! call MatConvert(temp_mat, MATSAME, MAT_INITIAL_MATRIX, & +! temp_mat_reuse, ierr) + +! call MatAXPYWrapper(temp_mat_reuse, -1d0, temp_mat_compare) +! call MatNorm(temp_mat_reuse, NORM_FROBENIUS, normy, ierr) +! ! There is floating point compute in these inverses, so we have to be a +! ! bit more tolerant to rounding differences +! if (normy .gt. 1d-11 .OR. normy/=normy) then +! !call MatFilter(temp_mat_reuse, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) +! !call MatView(temp_mat_reuse, PETSC_VIEWER_STDOUT_WORLD, ierr) +! print *, "Kokkos and CPU versions of mat_mult_powers_share_sparsity do not match" + +! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) +! end if +! call MatDestroy(temp_mat_reuse, ierr) +! if (.NOT. reuse_triggered_cmat) then +! call MatDestroy(cmat, ierr) +! else +! call MatDestroy(temp_mat_compare, ierr) +! end if +! cmat = temp_mat +! end if + +! else + +! call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, & +! 
+!                          coefficients, reuse_mat, reuse_submatrices, cmat)
+
+! end if
+! #else
+   call mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, &
+            coefficients, reuse_mat, reuse_submatrices, cmat)
+! #endif
+
+   ! ~~~~~~~~~~
+
+   end subroutine mat_mult_powers_share_sparsity_newton
+
+!------------------------------------------------------------------------------------------------------------------------
+
+   subroutine mat_mult_powers_share_sparsity_newton_cpu(matrix, poly_order, poly_sparsity_order, coefficients, &
+                  reuse_mat, reuse_submatrices, cmat)
+
+      ! Compute newton powers with the same sparsity
+
+      ! ~~~~~~~~~~
+      ! Input
+      type(tMat), target, intent(in) :: matrix
+      integer, intent(in) :: poly_order, poly_sparsity_order
+      PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients
+      type(tMat), intent(inout) :: reuse_mat, cmat
+      type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices
+
+      PetscInt :: local_rows, local_cols, global_rows, global_cols
+      PetscInt :: global_row_start, global_row_end_plus_one, row_index_into_submatrix
+      PetscInt :: global_col_start, global_col_end_plus_one, n, ncols, ncols_two, ifree, max_nnzs
+      PetscInt :: i_loc, j_loc, row_size, rows_ao, cols_ao, rows_ad, cols_ad, shift = 0, diag_index
+      integer :: errorcode, match_counter, term
+      integer :: comm_size
+      PetscErrorCode :: ierr
+      integer, dimension(:), allocatable :: cols_index_one, cols_index_two
+      PetscInt, dimension(:), allocatable :: col_indices_off_proc_array, ad_indices, cols
+      PetscReal, dimension(:), allocatable :: vals
+      type(tIS), dimension(1) :: col_indices, row_indices
+      type(tMat) :: Ad, Ao, mat_sparsity_match, mat_product_save
+      PetscInt, dimension(:), pointer :: colmap
+      logical :: deallocate_submatrices = .FALSE.
+      type(c_ptr) :: vals_c_ptr
+      type(int_vec), dimension(:), allocatable :: symbolic_ones
+      type(real_vec), dimension(:), allocatable :: symbolic_vals
+      integer(c_long_long) :: A_array
+      MPIU_Comm :: MPI_COMM_MATRIX
+      PetscReal, dimension(:), allocatable :: vals_power_temp, vals_previous_power_temp, temp
+      PetscInt, dimension(:), pointer :: submatrices_ia, submatrices_ja, cols_two_ptr, cols_ptr
+      PetscReal, dimension(:), pointer :: vals_two_ptr, vals_ptr
+      real(c_double), pointer :: submatrices_vals(:)
+      logical :: reuse_triggered
+      PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done
+      PetscInt, parameter :: one = 1, zero = 0
+      logical :: output_first_complex, skip_add
+      PetscReal :: square_sum
+      integer, dimension(poly_order + 1, 2) :: status_output
+      PetscReal, dimension(poly_sparsity_order+1,2) :: coeffs_contig
+
+      ! ~~~~~~~~~~
+
+      if (poly_sparsity_order .ge. size(coefficients, 1)-1) then
+         print *, "Requested sparsity is greater than or equal to the order"
+         call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
+      end if
+
+      call PetscObjectGetComm(matrix, MPI_COMM_MATRIX, ierr)
+      ! Get the comm size
+      call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode)
+
+      ! Get the local sizes
+      call MatGetLocalSize(matrix, local_rows, local_cols, ierr)
+      call MatGetSize(matrix, global_rows, global_cols, ierr)
+      ! This returns the global index of the local portion of the matrix
+      call MatGetOwnershipRange(matrix, global_row_start, global_row_end_plus_one, ierr)
+      call MatGetOwnershipRangeColumn(matrix, global_col_start, global_col_end_plus_one, ierr)
+
+      reuse_triggered = .NOT. PetscObjectIsNull(cmat)
+
+      ! ~~~~~~~~~~
+      ! Compute cmat for all powers up to poly_sparsity_order
+      ! We have to be more careful here than in the monomial case
+      ! In the monomial case we just compute the matrix powers up to poly_sparsity_order
+      ! and add them, scaled by the coefficients, to cmat
+      ! Here though we have to build the Newton basis polynomials
+      ! As a rule, the value input here into cmat is correct up to the power
+      ! of poly_sparsity_order, with all other terms being added in the fixed sparsity loops
+      ! below
+      !
+      ! For complex conjugate roots, two terms are computed at a time, but if the sparsity order
+      ! falls in between those two roots, only part of the output cmat is included
+      ! Any remaining terms are output in either mat_sparsity_match or mat_product_save depending
+      ! on the case
+      ! mat_sparsity_match has either temp or prod in it depending on the case
+      ! mat_product_save only exists in some cases and stores prod from the previous term
+      ! mat_sparsity_match and mat_product_save are always output with the sparsity of sparsity order
+      !
+      ! status_output is an array of length the number of roots
+      ! and has a 1 in each position if that term has been added to the output
+      ! It just helps us keep track of what has gone into cmat and what hasn't
+      ! It breaks up complex conjugate pairs into the first root (i) which has the same power as prod
+      !     tmp = 2 * a * prod
+      !     p = p + 1/(a^2 + b^2) * tmp
+      ! and the second root (i+1) which has the same power as prod * A:
+      !     tmp = -A * prod
+      !     p = p + 1/(a^2 + b^2) * tmp
+      ! (a worked sketch of these recurrences is annotated below, after the assembly options)
+      ! ~~~~~~~~~~
+
+      output_first_complex = .FALSE.
+      if (poly_sparsity_order == 1) then
+
+         ! If we have a real first coefficient and a second complex
+         ! we can't call build_gmres_polynomial_newton_inverse_1st_1st as it is only correct
+         ! for valid coefficients up to 1st order (ie both real or both complex)
+         if (coefficients(1,2) == 0d0 .AND. coefficients(2,2) /= 0d0) then
+
+            call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, &
+                     cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, &
+                     status_output, mat_product_save)
+
+         ! Valid 1st order polynomial, so this case is easy
+         else
+
+            ! Duplicate & copy the matrix, but ensure there is a diagonal present
+            call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, cmat)
+            ! Have to be careful to pass in a contiguous piece of memory here
+            coeffs_contig = coefficients(1:poly_sparsity_order + 1, 1:2)
+            call build_gmres_polynomial_newton_inverse_1st_1st(matrix, 1, &
+                     coeffs_contig, &
+                     cmat, mat_sparsity_match, &
+                     status_output)
+         end if
+      else
+
+         ! If we're any higher, then we build cmat up to that order
+         ! But we have to be careful because the last root we want to explicitly
+         ! build up to here (ie the power of the matrix given by sparsity_order)
+         ! might be the first root of a complex conjugate pair
+         coeffs_contig = coefficients(1:poly_sparsity_order + 1, 1:2)
+         call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, &
+                  coeffs_contig, &
+                  cmat, mat_sparsity_match, poly_sparsity_order, output_first_complex, &
+                  status_output, mat_product_save)
+      end if
+
+      ! We know we will never have non-zero locations outside of the highest constrained sparsity power
+      call MatSetOption(cmat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr)
+      call MatSetOption(cmat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr)
+      ! We know we are only going to insert local vals
+      ! These options should turn off any reductions in the assembly
+      call MatSetOption(cmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr)
+
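+      ! As an illustrative sketch of the recurrences named above (ours, in the
+      ! notation of that comment): for a real root theta_1 followed by a
+      ! complex pair (a, b), the fixed sparsity loop below effectively does
+      !    p    = p + 1/theta_1 * prod
+      !    prod = prod - 1/theta_1 * A * prod
+      !    tmp  = 2*a*prod - A*prod
+      !    p    = p + 1/(a^2 + b^2) * tmp
+      !    prod = prod - 1/(a^2 + b^2) * A * tmp
+      ! with every product truncated back onto the sparsity of mat_sparsity_match
+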
+      ! ~~~~~~~~~~~~
+      ! If we're in parallel we need to get the off-process rows of matrix that correspond
+      ! to the columns of mat_sparsity_match
+      ! We can therefore do the matmult for every constrained power locally with just that data
+      ! ~~~~~~~~~~~~
+      ! Have to double check comm_size /= 1 as we might be on a subcommunicator and we can't call
+      ! MatMPIAIJGetSeqAIJ specifically if that's the case
+      if (comm_size /= 1) then
+
+         ! ~~~~
+         ! Get the cols
+         ! ~~~~
+         call MatMPIAIJGetSeqAIJ(mat_sparsity_match, Ad, Ao, colmap, ierr)
+
+         call MatGetSize(Ad, rows_ad, cols_ad, ierr)
+         ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns
+         call MatGetSize(Ao, rows_ao, cols_ao, ierr)
+
+         ! For the column indices we need to take all the columns of mat_sparsity_match
+         A_array = mat_sparsity_match%v
+
+         ! These are the global indices of the columns we want
+         allocate(col_indices_off_proc_array(cols_ad + cols_ao))
+         allocate(ad_indices(cols_ad))
+         ! Local rows (as global indices)
+         do ifree = 1, cols_ad
+            ad_indices(ifree) = global_row_start + ifree - 1
+         end do
+
+         ! col_indices_off_proc_array is now sorted, which are the global indices of the columns we want
+         call merge_pre_sorted(ad_indices, colmap, col_indices_off_proc_array)
+         deallocate(ad_indices)
+
+         ! Create the sequential IS we want with the cols we want (written as global indices)
+         call ISCreateGeneral(PETSC_COMM_SELF, cols_ad + cols_ao, &
+                  col_indices_off_proc_array, PETSC_USE_POINTER, col_indices(1), ierr)
+         call ISCreateGeneral(PETSC_COMM_SELF, cols_ao, &
+                  colmap, PETSC_USE_POINTER, row_indices(1), ierr)
+
+         ! ~~~~~~~
+         ! Now we can pull out the chunk of matrix that we need
+         ! ~~~~~~~
+
+         ! We need off-processor rows to compute matrix powers
+         ! Setting this is necessary to avoid an allreduce when calling createsubmatrices
+         ! This will be reset to false after the call to createsubmatrices
+         call MatSetOption(matrix, MAT_SUBMAT_SINGLEIS, PETSC_TRUE, ierr)
+
+         ! Now this will be doing comms to get the non-local rows we want
+         ! But only including the columns of the local fixed sparsity, as we don't need all the
+         ! columns of the non-local entries unless we are doing a full matmatmult
+         ! This returns a sequential matrix
+         if (.NOT. PetscObjectIsNull(reuse_mat)) then
+            reuse_submatrices(1) = reuse_mat
+            call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_REUSE_MATRIX, reuse_submatrices, ierr)
+         else
+            call MatCreateSubMatrices(matrix, one, row_indices, col_indices, MAT_INITIAL_MATRIX, reuse_submatrices, ierr)
+            reuse_mat = reuse_submatrices(1)
+         end if
+         row_size = size(col_indices_off_proc_array)
+         call ISDestroy(col_indices(1), ierr)
+         call ISDestroy(row_indices(1), ierr)
+
+      ! Easy in serial as we have everything we need
+      else
+
+         Ad = mat_sparsity_match
+         cols_ad = local_cols
+         allocate(reuse_submatrices(1))
+         deallocate_submatrices = .TRUE.
+         reuse_submatrices(1) = matrix
+         row_size = local_rows
+         allocate(col_indices_off_proc_array(local_rows))
+         do ifree = 1, local_rows
+            col_indices_off_proc_array(ifree) = ifree-1
+         end do
+      end if
+
+      ! ~~~~~~~~~
+      ! Now that we are here, reuse_submatrices(1) contains the rows of matrix
+      ! that correspond to the non-zero columns of mat_sparsity_match
+      ! ~~~~~~~~~
+
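+      ! As an illustrative example (hypothetical indices): if this rank owns
+      ! global columns 0-3 and colmap = [7, 9], merge_pre_sorted gives
+      ! col_indices_off_proc_array = [0, 1, 2, 3, 7, 9], so the local and
+      ! off-process columns form one sorted set of global indices
+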
+      ! Have to get the max nnzs of the local and off-local rows we've just retrieved
+      max_nnzs = 0
+      do ifree = global_row_start, global_row_end_plus_one-1
+         call MatGetRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+         if (ncols > max_nnzs) max_nnzs = ncols
+         call MatRestoreRow(matrix, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+      end do
+      if (comm_size /= 1) then
+         do ifree = 1, cols_ao
+            call MatGetRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+            if (ncols > max_nnzs) max_nnzs = ncols
+            call MatRestoreRow(reuse_submatrices(1), ifree-1, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+         end do
+      end if
+      ! and also the sparsity power
+      do ifree = global_row_start, global_row_end_plus_one-1
+         call MatGetRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+         if (ncols > max_nnzs) max_nnzs = ncols
+         call MatRestoreRow(mat_sparsity_match, ifree, ncols, PETSC_NULL_INTEGER_POINTER, PETSC_NULL_SCALAR_POINTER, ierr)
+      end do
+
+      ! ~~~~~~~~
+      ! Get pointers to the sequential aij structure so we don't have to put critical regions
+      ! around the matgetrow
+      ! ~~~~~~~~
+      call MatGetRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr)
+      if (.NOT. done) then
+         print *, "Pointers not set in call to MatGetRowIJ"
+         call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
+      end if
+      ! Returns the wrong size pointer and can break if that size goes negative??
+      !call MatSeqAIJGetArrayF90(reuse_submatrices(1),submatrices_vals,ierr);
+      A_array = reuse_submatrices(1)%v
+      ! Now we must never overwrite the values in this pointer, and we must
+      ! never call restore on it, see comment on top of the commented out
+      ! MatSeqAIJRestoreArray below
+      call MatSeqAIJGetArrayF90_mine(A_array, vals_c_ptr)
+      call c_f_pointer(vals_c_ptr, submatrices_vals, shape=[size(submatrices_ja)])
+
+      ! ~~~~~~~~~~
+
+      allocate(cols(max_nnzs))
+      allocate(vals(max_nnzs))
+      allocate(vals_power_temp(max_nnzs))
+      allocate(vals_previous_power_temp(max_nnzs))
+      allocate(temp(max_nnzs))
+      allocate(cols_index_one(max_nnzs))
+      allocate(cols_index_two(max_nnzs))
+
+      ! ~~~~~~~~~~~~
+      ! From here we now have cmat with the correct values up to the power poly_sparsity_order
+      ! and hence we want to add in the sparsity constrained powers
+      ! ~~~~~~~~~~~~
+
+      ! Now go through and compute the sum of the matrix powers
+      ! We're doing row-wise matmatmults here assuming the fixed sparsity
+      ! We exploit the fact that the subsequent matrix powers can be done
+      ! one row at a time, so we only have to retrieve the needed vals from mat_sparsity_match once
+      do i_loc = 1, local_rows
+
+         ! Get the row of mat_sparsity_match
+         call MatGetRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, &
+                  cols_ptr, vals_ptr, ierr)
+         ! Copying here because mat_sparsity_match and matrix are often the same matrix
+         ! and hence we can only have one active matgetrow
+         ncols = ncols_two
+         cols(1:ncols) = cols_ptr(1:ncols)
+         vals(1:ncols) = vals_ptr(1:ncols)
+         call MatRestoreRow(mat_sparsity_match, i_loc - 1 + global_row_start, ncols_two, &
+                  cols_ptr, vals_ptr, ierr)
+         diag_index = -1
+         ! Find the diagonal index in this row
+         do j_loc = 1, ncols
+            if (cols(j_loc) == i_loc - 1 + global_row_start) then
+               diag_index = j_loc
+               exit
+            end if
+         end do
+
+         ! This is just a symbolic pass over the set of rows given in cols
+         ! 
Let's just do all the column matching and extraction of the values once + + ! Allocate some space to store the matching indices + allocate(symbolic_ones(ncols)) + allocate(symbolic_vals(ncols)) + row_index_into_submatrix = 1 + + ! This is a row-wise product + do j_loc = 1, ncols + + ! If we're trying to access a local row in matrix + if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + + call MatGetRow(matrix, cols(j_loc), ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + + ! If we're trying to access a non-local row in matrix + else + + ! this is local row index we want into reuse_submatrices(1) (as row_indices used to extract are just colmap) + ! We know cols is sorted, so every non-local index will be greater than the last one + ! (it's just that cols could have some local ones between different non-local) + ! colmap is also sorted and we know every single non-local entry in cols(j_loc) is in colmap + do while (row_index_into_submatrix .le. cols_ao .AND. colmap(row_index_into_submatrix) .lt. cols(j_loc)) + row_index_into_submatrix = row_index_into_submatrix + 1 + end do + + ! This is the number of columns + ncols_two = submatrices_ia(row_index_into_submatrix+1) - submatrices_ia(row_index_into_submatrix) + allocate(cols_two_ptr(ncols_two)) + ! This is the local column indices in reuse_submatrices(1) + cols_two_ptr = submatrices_ja(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + ! Because col_indices_off_proc_array (and hence the column indices in reuse_submatrices(1) is sorted, + ! then cols_two_ptr contains the sorted global column indices + cols_two_ptr = col_indices_off_proc_array(cols_two_ptr+1) + + ! This is the values + vals_two_ptr => & + submatrices_vals(submatrices_ia(row_index_into_submatrix)+1:submatrices_ia(row_index_into_submatrix+1)) + end if + + ! Search for the matching column + ! We're intersecting the global column indices of mat_sparsity_match (cols) and matrix (cols_two_ptr) + call intersect_pre_sorted_indices_only(cols(1:ncols), cols_two_ptr, cols_index_one, cols_index_two, match_counter) + + ! Don't need to do anything if we have no matches + if (match_counter == 0) then + ! Store that we can skip this entry + symbolic_ones(j_loc)%ptr => null() + symbolic_vals(j_loc)%ptr => null() + else + + ! These are the matching local column indices for this row of mat_sparsity_match + allocate(symbolic_ones(j_loc)%ptr(match_counter)) + symbolic_ones(j_loc)%ptr = cols_index_one(1:match_counter) + + ! These are the matching values of matrix + allocate(symbolic_vals(j_loc)%ptr(match_counter)) + symbolic_vals(j_loc)%ptr = vals_two_ptr(cols_index_two(1:match_counter)) + end if + + ! Restore local row of matrix + if (cols(j_loc) .ge. global_row_start .AND. cols(j_loc) < global_row_end_plus_one) then + call MatRestoreRow(matrix, cols(j_loc), ncols_two, & + cols_two_ptr, vals_two_ptr, ierr) + else + deallocate(cols_two_ptr) + end if + end do + + ! Start with the values of mat_sparsity_match in it + vals_previous_power_temp(1:ncols) = vals(1:ncols) + + ! Loop over any matrix powers + ! vals_power_temp stores the prod for this row, and we update this as we go through + ! the term loop + term = poly_sparsity_order + 1 + skip_add = .FALSE. + ! If the fixed sparsity root is the second of a complex pair, we start one term earlier + ! so that we can compute the correct part of the fixed sparsity product, we just make sure not to add + ! 
anything to cmat as it is already correct up to the fixed sparsity order + if (coefficients(term,2) /= 0d0 .AND. .NOT. output_first_complex) then + term = term - 1 + skip_add = .TRUE. + end if + + ! This loop skips the last coefficient + do while (term .le. size(coefficients, 1) - 1) + + ! If real + if (coefficients(term,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(term,1)) < 1e-12) then + term = term + 1 + cycle + end if + + ! ~~~~~~~~~~~ + ! Now can add the value to our matrix + ! ~~~~~~~~~~~ + if (ncols /= 0 .AND. status_output(term, 1) /= 1) then + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + 1d0/coefficients(term, 1) * vals_previous_power_temp(1:ncols), ADD_VALUES, ierr) + end if + + ! Initialize with previous product before the A*prod subtraction + vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols) + + ! This is the 1/theta_i * A * prod but where A * prod has fixed sparsity + do j_loc = 1, ncols + + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - & + 1d0/coefficients(term, 1) * & + symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) + end do + + term = term + 1 + + ! If complex + else + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (coefficients(term,1)**2 + coefficients(term,2)**2 < 1e-12) then + term = term + 2 + cycle + end if + + square_sum = 1d0/(coefficients(term,1)**2 + coefficients(term,2)**2) + + ! If our fixed sparsity order falls on the first of a complex conjugate pair + if (.NOT. skip_add) then + + ! We skip the 2 * a * prod from the first root of a complex pair if that has already + ! been included in the cmat from build_gmres_polynomial_newton_inverse_full + if (status_output(term, 2) /= 1) then + temp(1:ncols) = 2 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + else + temp(1:ncols) = 0d0 + end if + + ! This is the -A * prod + do j_loc = 1, ncols + + ! If we have no matching columns cycle this row + if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle + + ! symbolic_vals(j_loc)%ptr has the matching values of A in it + temp(symbolic_ones(j_loc)%ptr) = temp(symbolic_ones(j_loc)%ptr) - & + symbolic_vals(j_loc)%ptr * vals_previous_power_temp(j_loc) + end do + + ! This is the p = p + 1/(a^2 + b^2) * temp + if (ncols /= 0) then + call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, & + square_sum * temp(1:ncols), ADD_VALUES, ierr) + end if + + ! Here we need to go back in and ensure 2 *a * prod is in temp if we skipped it + ! above. We know it is already in cmat, but it has to be in temp when we + ! do the next product + if (status_output(term, 2) == 1) then + if (output_first_complex) then + temp(1:ncols) = temp(1:ncols) + 2d0 * coefficients(term, 1) * vals_previous_power_temp(1:ncols) + end if + end if + + ! If our fixed sparsity order falls on the second of a complex conjugate pair + else + + ! In this case we have already included both 2*a*prod - A * prod into cmat + ! But we still have to compute the product for the next term + ! The problem here is that mat_sparsity_match has temp in it in this case, not + ! the old prod from whatever the previous loop is + ! 
+                  ! In that case build_gmres_polynomial_newton_inverse_full also outputs
+                  ! mat_product_save which is the old value of prod but with the sparsity of
+                  ! mat_sparsity_match (with zeros if needed)
+
+                  ! This case only occurs once for each row, so once we've hit this
+                  ! we will always have our correct prod
+                  skip_add = .FALSE.
+                  ! temp is output into mat_sparsity_match in this case
+                  temp(1:ncols) = vals_previous_power_temp(1:ncols)
+
+                  ! If sparsity order is 1, the previous product will have been the identity
+                  ! and we don't output it into mat_product_save because that is a trivial case
+                  ! we can do ourselves
+                  if (term == 1) then
+                     vals_previous_power_temp(1:ncols) = 0d0
+                     if (diag_index /= -1) then
+                        vals_previous_power_temp(diag_index) = 1d0
+                     end if
+
+                  ! In the case where mat_product_save is not the identity, we need to pull its value out
+                  else
+
+                     call MatGetRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, &
+                              cols_two_ptr, vals_two_ptr, ierr)
+
+                     ! We have guaranteed in the build_gmres_polynomial_newton_inverse_full
+                     ! version that mat_product_save has fixed sparsity
+                     vals_previous_power_temp(1:ncols_two) = vals_two_ptr(1:ncols_two)
+
+                     call MatRestoreRow(mat_product_save, i_loc - 1 + global_row_start, ncols_two, &
+                              cols_two_ptr, vals_two_ptr, ierr)
+                  end if
+               end if
+
+               ! Now we compute the next product
+               if (term .le. size(coefficients, 1) - 2) then
+
+                  vals_power_temp(1:ncols) = vals_previous_power_temp(1:ncols)
+
+                  ! This is prod = prod - 1/(a^2 + b^2) * A * temp
+                  do j_loc = 1, ncols
+
+                     ! If we have no matching columns cycle this row
+                     if (.NOT. associated(symbolic_ones(j_loc)%ptr)) cycle
+
+                     ! symbolic_vals(j_loc)%ptr has the matching values of A in it
+                     vals_power_temp(symbolic_ones(j_loc)%ptr) = vals_power_temp(symbolic_ones(j_loc)%ptr) - &
+                              square_sum * &
+                              symbolic_vals(j_loc)%ptr * temp(j_loc)
+                  end do
+               end if
+
+               term = term + 2
+
+            end if
+
+            ! This should now have the value of prod in it
+            vals_previous_power_temp(1:ncols) = vals_power_temp(1:ncols)
+         end do
+
+         ! Final step if last root is real
+         if (coefficients(size(coefficients, 1),2) == 0d0) then
+            if (ncols /= 0 .AND. abs(coefficients(size(coefficients, 1),1)) > 1e-12) then
+               call MatSetValues(cmat, one, [global_row_start + i_loc-1], ncols, cols, &
+                        1d0/coefficients(size(coefficients, 1), 1) * vals_power_temp(1:ncols), ADD_VALUES, ierr)
+            end if
+         end if
+
+         ! ~~~~~~~~~~~~~~~
+
+         do j_loc = 1, ncols
+            if (associated(symbolic_ones(j_loc)%ptr)) then
+               deallocate(symbolic_ones(j_loc)%ptr)
+               deallocate(symbolic_vals(j_loc)%ptr)
+            end if
+         end do
+         deallocate(symbolic_vals, symbolic_ones)
+      end do
+
+      call MatRestoreRowIJ(reuse_submatrices(1),shift,symmetric,inodecompressed,n,submatrices_ia,submatrices_ja,done,ierr)
+      ! We very deliberately don't call restorearray here!
+      ! There is no matseqaijgetarrayread or matseqaijrestorearrayread in Fortran
+      ! The read-only routines don't increment the object state (PetscObjectStateGet) which
+      ! tells petsc the mat has changed. Hence above we directly access the data pointer with
+      ! a call to MatSeqAIJGetArrayF90_mine and then never write into it
+      ! If we call the restorearrayf90, that does increment the object state
+      ! even though we only read from the array
+      ! That would mean if we pass in a pc->pmat for example, just setting up a pc
+      ! would trigger petsc setting up the pc on every iteration of the pc
+      ! call MatSeqAIJRestoreArray(reuse_submatrices(1),submatrices_vals,ierr);
+
+      ! ~~~~~~~~~~~
+
+      ! 
Do the assembly, should need zero reductions in this given we've set the + ! flags above + call MatAssemblyBegin(cmat, MAT_FINAL_ASSEMBLY, ierr) + + ! Delete temporaries + call MatDestroy(mat_sparsity_match, ierr) + call MatDestroy(mat_product_save, ierr) + if (deallocate_submatrices) then + deallocate(reuse_submatrices) + reuse_submatrices => null() + end if + + deallocate(col_indices_off_proc_array) + deallocate(cols, vals, vals_power_temp, vals_previous_power_temp, temp, cols_index_one, cols_index_two) + + ! Finish assembly + call MatAssemblyEnd(cmat, MAT_FINAL_ASSEMBLY, ierr) + + + end subroutine mat_mult_powers_share_sparsity_newton_cpu ! ------------------------------------------------------------------------------------------------------------------------------- subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & coefficients, & + poly_sparsity_order, matrix_free, reuse_mat, reuse_submatrices, & inv_matrix) ! Builds a matrix which is an approximation to the inverse of a matrix using the @@ -684,7 +1647,10 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & type(tMat), intent(in) :: matrix integer, intent(in) :: poly_order PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients - type(tMat), intent(inout) :: inv_matrix + integer, intent(in) :: poly_sparsity_order + logical, intent(in) :: matrix_free + type(tMat), intent(inout) :: reuse_mat, inv_matrix + type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices ! Local variables PetscInt :: global_rows, global_cols, local_rows, local_cols @@ -692,6 +1658,7 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & PetscErrorCode :: ierr MPIU_Comm :: MPI_COMM_MATRIX type(mat_ctxtype), pointer :: mat_ctx=>null() + logical :: reuse_triggered ! ~~~~~~ @@ -708,47 +1675,728 @@ subroutine build_gmres_polynomial_newton_inverse(matrix, poly_order, & ! ~~~~~~~ ! Just build a matshell that applies our polynomial matrix-free ! ~~~~~~~ + if (matrix_free) then + + ! If not re-using + if (PetscObjectIsNull(inv_matrix)) then + + ! Have to dynamically allocate this + allocate(mat_ctx) + + ! We pass in the polynomial coefficients as the context + call MatCreateShell(MPI_COMM_MATRIX, local_rows, local_cols, global_rows, global_cols, & + mat_ctx, inv_matrix, ierr) + ! The subroutine petsc_matvec_gmres_newton_mf applies the polynomial inverse + call MatShellSetOperation(inv_matrix, & + MATOP_MULT, petsc_matvec_gmres_newton_mf, ierr) + + call MatAssemblyBegin(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) + call MatAssemblyEnd(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) + ! Have to make sure to set the type of vectors the shell creates + call ShellSetVecType(matrix, inv_matrix) + + ! Create temporary vectors we use during application + ! Make sure to use matrix here to get the right type (as the shell doesn't know about gpus) + call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_TEMP), PETSC_NULL_VEC, ierr) + call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_RHS), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) + + ! Reusing + else + call MatShellGetContext(inv_matrix, mat_ctx, ierr) + + end if + + mat_ctx%real_roots => coefficients(:, 1) + mat_ctx%imag_roots => coefficients(:, 2) + ! Now because the context reset deallocates the coefficient pointer + ! we want to make sure we don't leak memory, so we use pointer remapping here + ! to turn the 2D coefficient pointer into a 1D that we can store in mat_ctx%coefficients + ! 
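The pointer remapping used for mat_ctx%coefficients above relies on F2008 rank-changing pointer bounds remapping over a contiguous target: a 2D allocated pointer is viewed through a 1D pointer, so one deallocate on the 1D view releases the whole block. A minimal standalone illustration of the technique, with hypothetical names rather than the PFLARE context types:

program pointer_remap_demo
   implicit none
   real(8), dimension(:, :), pointer, contiguous :: coeffs_2d
   real(8), dimension(:), pointer :: coeffs_1d

   allocate(coeffs_2d(4, 2))
   coeffs_2d(:, 1) = [1d0, 2d0, 3d0, 4d0]   ! real parts, say
   coeffs_2d(:, 2) = [0d0, 0d0, 5d0, -5d0]  ! imaginary parts

   ! Rank-changing pointer bounds remapping: requires a contiguous target
   coeffs_1d(1:2*size(coeffs_2d, 1)) => coeffs_2d(:, :)
   print *, coeffs_1d   ! column-major: all real parts, then all imaginary parts

   ! Deallocating through the 1D view frees the storage allocated via the 2D pointer,
   ! which is the behaviour the context reset above depends on
   deallocate(coeffs_1d)
   nullify(coeffs_2d)
end program pointer_remap_demo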
and then the deallocate on mat_ctx%coefficients should still delete all the memory
+         mat_ctx%coefficients(1:2*size(coefficients,1)) => coefficients(:, :)
+         ! This is the matrix whose inverse we are applying (just copying the pointer here)
+         mat_ctx%mat = matrix
+
+         ! We're done
+         return
+      endif
+
+      reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix)
+
+      ! ~~~~~~~~~~~~
+      ! If we're here then we want an assembled approximate inverse
+      ! ~~~~~~~~~~~~
+
+      ! If we're a zeroth order poly this is trivial as it's just 1/theta_1 I
+      if (poly_order == 0) then
+
+         call build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, coefficients, &
+                  inv_matrix)
+
+         ! Then just return
+         return
+
+      ! For poly_order 1 and poly_sparsity_order 1 this is easy
+      else if (poly_order == 1 .AND. poly_sparsity_order == 1) then
+
+         ! Duplicate & copy the matrix, but ensure there is a diagonal present
+         call mat_duplicate_copy_plus_diag(matrix, reuse_triggered, inv_matrix)
+
+         call build_gmres_polynomial_newton_inverse_1st_1st(matrix, &
+                  poly_order, coefficients, inv_matrix)
+
+         ! Then just return
+         return
+
+      end if
+
+      ! If we're constraining sparsity we've built a custom matrix-powers that assumes fixed sparsity
+      if (poly_sparsity_order < poly_order) then
+
+         ! This routine is a custom one that builds our matrix powers and assumes fixed sparsity
+         ! so that it doesn't have to do many comms
+         call mat_mult_powers_share_sparsity_newton(matrix, poly_order, poly_sparsity_order, coefficients, &
+                  reuse_mat, reuse_submatrices, inv_matrix)
+
+         ! Then just return
+         return
+
+      end if
+
+      ! ~~~~~~~~~~
+      ! We are only here if we don't constrain_sparsity
+      ! ~~~~~~~~~~
+      call build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, &
+               inv_matrix)
+
+
+   end subroutine build_gmres_polynomial_newton_inverse
+
+! -------------------------------------------------------------------------------------------------------------------------------
+
+   subroutine build_gmres_polynomial_newton_inverse_0th_order(matrix, poly_order, coefficients, &
+                  inv_matrix)
+
+      ! Specific 0th order inverse
+
+      ! ~~~~~~
+      type(tMat), intent(in) :: matrix
+      integer, intent(in) :: poly_order
+      PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients
+      type(tMat), intent(inout) :: inv_matrix
+
+      ! Local variables
+      integer :: errorcode
+      PetscErrorCode :: ierr
+      logical :: reuse_triggered
+      type(tVec) :: diag_vec
+
+      ! ~~~~~~
+
+      if (poly_order /= 0) then
+         print *, "This is a 0th order inverse, but poly_order is not 0"
+         call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
+      end if
+
+      ! Let's create a matrix to represent the inverse diagonal
+      reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix)
+
+      if (.NOT. reuse_triggered) then
+         call MatCreateVecs(matrix, PETSC_NULL_VEC, diag_vec, ierr)
+      else
+         call MatDiagonalGetDiagonal(inv_matrix, diag_vec, ierr)
+      end if
+
+      ! Must be real as we only have one coefficient
+      call VecSet(diag_vec, 1d0/coefficients(1, 1), ierr)
+
+      ! We may be reusing with the same sparsity
+      if (.NOT. reuse_triggered) then
+         ! The matrix takes ownership of diag_vec and increases ref counter
+         call MatCreateDiagonal(diag_vec, inv_matrix, ierr)
+         call VecDestroy(diag_vec, ierr)
+      else
+         call MatDiagonalRestoreDiagonal(inv_matrix, diag_vec, ierr)
+      end if
+
+   end subroutine build_gmres_polynomial_newton_inverse_0th_order
+
+
+!
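All of the assembled variants dispatched above, and the matrix-free apply, evaluate the same Newton-form recurrence: each real root theta contributes prod/theta, and complex conjugate pairs a +- ib are folded together so only real arithmetic is needed. A scalar standalone check of that recurrence against the residual-polynomial identity p(z) = (1 - prod_i(1 - z/theta_i))/z; the root values here are hypothetical, and the matrix routines additionally special-case the final real root to save one product, which a scalar version does not need.

program newton_recurrence_check
   implicit none
   integer, parameter :: nroots = 4
   ! Hypothetical roots: one real, a complex conjugate pair, one real
   real(8), parameter :: re(nroots) = [2d0, 1d0, 1d0, 3d0]
   real(8), parameter :: im(nroots) = [0d0, 0.5d0, -0.5d0, 0d0]
   real(8) :: z, p, prod, tmp, square_sum
   complex(8) :: r
   integer :: i

   z = 0.7d0
   p = 0d0
   prod = 1d0
   i = 1
   do while (i <= nroots)
      if (im(i) == 0d0) then
         ! Real root: p = p + prod/theta, then prod = prod*(1 - z/theta)
         p = p + prod/re(i)
         prod = prod*(1d0 - z/re(i))
         i = i + 1
      else
         ! Complex pair a +- ib handled together, cf. the complex branches
         ! in the assembled routines: tmp = (2a - z)*prod/(a^2 + b^2)
         square_sum = re(i)**2 + im(i)**2
         tmp = (2d0*re(i) - z)*prod/square_sum
         p = p + tmp
         prod = prod - z*tmp
         i = i + 2
      end if
   end do

   ! Reference: p(z) = (1 - r(z))/z with r(z) = prod_i (1 - z/theta_i)
   r = (1d0, 0d0)
   do i = 1, nroots
      r = r*(1d0 - z/cmplx(re(i), im(i), kind=8))
   end do
   print *, "recurrence:", p, " identity:", real((1d0 - r)/z)
end program newton_recurrence_check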
------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton(matrix, poly_order, coefficients, & + inv_matrix) + + ! Specific inverse with 0th order sparsity + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + + ! Local variables + integer :: i + PetscErrorCode :: ierr + logical :: reuse_triggered + type(tVec) :: inv_vec, diag_vec, product_vec, temp_vec_A, one_vec, temp_vec_two + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + + ! Our matrix has to be square + call MatCreateVecs(matrix, product_vec, diag_vec, ierr) + call MatGetDiagonal(matrix, diag_vec, ierr) + + if (.NOT. reuse_triggered) then + call VecDuplicate(diag_vec, inv_vec, ierr) + else + call MatDiagonalGetDiagonal(inv_matrix, inv_vec, ierr) + end if + call VecDuplicate(diag_vec, temp_vec_A, ierr) + call VecDuplicate(diag_vec, one_vec, ierr) + call VecDuplicate(diag_vec, temp_vec_two, ierr) + + ! Set to zero as we add to it + call VecSet(inv_vec, 0d0, ierr) + ! We start with an identity in product_vec + call VecSet(product_vec, 1d0, ierr) + call VecSet(one_vec, 1d0, ierr) + + i = 1 + do while (i .le. size(coefficients, 1) - 1) + + ! temp_vec_A is going to store things with the sparsity of A + call VecCopy(diag_vec, temp_vec_A, ierr) + + ! If real this is easy + if (coefficients(i,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero - see + ! the comment in calculate_gmres_polynomial_roots_newton + if (abs(coefficients(i,1)) < 1e-12) then + i = i + 1 + cycle + end if + + call VecAXPY(inv_vec, 1d0/coefficients(i,1), product_vec, ierr) + + ! temp_vec_A = A_ff/theta_k + call VecScale(temp_vec_A, -1d0/coefficients(i,1), ierr) + ! temp_vec_A = I - A_ff/theta_k + call VecAXPY(temp_vec_A, 1d0, one_vec, ierr) + + ! product_vec = product_vec * temp_vec_A + call VecPointwiseMult(product_vec, product_vec, temp_vec_A, ierr) + + i = i + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then + i = i + 2 + cycle + end if + + ! Compute 2a I - A + ! temp_vec_A = -A + call VecScale(temp_vec_A, -1d0, ierr) + ! temp_vec_A = 2a I - A_ff + call VecAXPY(temp_vec_A, 2d0 * coefficients(i,1), one_vec, ierr) + ! temp_vec_A = (2a I - A_ff)/(a^2 + b^2) + call VecScale(temp_vec_A, 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2), ierr) + + ! temp_vec_two = temp_vec_A * product_vec + call VecPointwiseMult(temp_vec_two, temp_vec_A, product_vec, ierr) + call VecAXPY(inv_vec, 1d0, temp_vec_two, ierr) + + if (i .le. size(coefficients, 1) - 2) then + ! temp_vec_two = A * temp_vec_two + call VecPointwiseMult(temp_vec_two, diag_vec, temp_vec_two, ierr) + call VecAXPY(product_vec, -1d0, temp_vec_two, ierr) + end if + + ! Skip two evals + i = i + 2 + + end if + end do + + ! Final step if last root is real + if (coefficients(size(coefficients,1),2) == 0d0) then + ! Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(size(coefficients, 1),1)) > 1e-12) then + call VecAXPY(inv_vec, 1d0/coefficients(size(coefficients, 1),1), product_vec, ierr) + end if + end if + + ! We may be reusing with the same sparsity + if (.NOT. reuse_triggered) then + ! 
The matrix takes ownership of inv_vec and increases ref counter + call MatCreateDiagonal(inv_vec, inv_matrix, ierr) + call VecDestroy(inv_vec, ierr) + else + call MatDiagonalRestoreDiagonal(inv_matrix, inv_vec, ierr) + end if + + call VecDestroy(diag_vec, ierr) + call VecDestroy(product_vec, ierr) + call VecDestroy(temp_vec_A, ierr) + call VecDestroy(one_vec, ierr) + call VecDestroy(temp_vec_two, ierr) + + end subroutine build_gmres_polynomial_inverse_0th_order_sparsity_newton + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_1st_1st(matrix, poly_order, coefficients, & + inv_matrix, mat_prod_or_temp, status_output) + + ! Specific 1st order with 1st order sparsity + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + type(tMat), intent(inout), optional :: mat_prod_or_temp + integer, dimension(:, :), intent(inout), optional :: status_output + + ! Local variables + PetscErrorCode :: ierr + logical :: reuse_triggered, output_product + PetscReal :: square_sum + + ! ~~~~~~ + + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + output_product = present(mat_prod_or_temp) - ! If not re-using - if (PetscObjectIsNull(inv_matrix)) then + ! Flags to prevent reductions when assembling (there are assembles in the shift) + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr) + call MatSetOption(inv_matrix, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr) - ! Have to dynamically allocate this - allocate(mat_ctx) + if (output_product) then + status_output = 0 + end if - ! We pass in the polynomial coefficients as the context - call MatCreateShell(MPI_COMM_MATRIX, local_rows, local_cols, global_rows, global_cols, & - mat_ctx, inv_matrix, ierr) - ! The subroutine petsc_matvec_gmres_newton_mf applies the polynomial inverse - call MatShellSetOperation(inv_matrix, & - MATOP_MULT, petsc_matvec_gmres_newton_mf, ierr) + ! We only have two coefficients, so they are either both real or complex conjugates + ! If real + if (coefficients(1,2) == 0d0) then + + ! Have to be careful here, as we may be first order, but the second eigenvaule + ! might have been set to zero thanks to the rank reducing solve + ! So we just check if the second real part is zero and if it is + ! we just compute a 0th order inverse - annoyingly we can't call + ! build_gmres_polynomial_newton_inverse_0th_order as that builds a MATDIAGONAL + ! and in the tests there is a problem where we reuse the sparsity, in the first + ! solve we don't have a zero coefficient but in the second solve we do + ! So the mat type needs to remain consistent + ! This can't happen in the complex case + if (abs(coefficients(2,1)) < 1e-12) then + + ! Set to zero + call MatScale(inv_matrix, 0d0, ierr) + + ! Tricky case here as we want to pass out the identity with the + ! sparsity of A + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + call MatShift(mat_prod_or_temp, 1d0, ierr) + status_output(1:2, 1) = 1 + end if + + ! Then add in the 0th order inverse + call MatShift(inv_matrix, 1d0/coefficients(1,1), ierr) + + ! 
Then just return + return + end if - call MatAssemblyBegin(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) - call MatAssemblyEnd(inv_matrix, MAT_FINAL_ASSEMBLY, ierr) - ! Have to make sure to set the type of vectors the shell creates - call ShellSetVecType(matrix, inv_matrix) + ! Could just compute the equivalent mononomial here to save some flops + ! but the whole point to doing the Newton form is to avoid the + ! theta_1 * theta_2 that would result + + ! result = -A_ff/theta_1 + call MatScale(inv_matrix, -1d0/(coefficients(1, 1)), ierr) + ! result = I -A_ff/theta_1 + call MatShift(inv_matrix, 1d0, ierr) + ! If we're doing this as part of fixed sparsity multiply, + ! we need to return mat_prod_or_temp + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + ! result = 1/theta_2 * (I -A_ff/theta_1) + call MatScale(inv_matrix, 1d0/(coefficients(2, 1)), ierr) + + ! result = 1/theta_1 + 1/theta_2 * (I -A_ff/theta_1) + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 1d0/(coefficients(1, 1)), ierr) - ! Create temporary vectors we use during application - ! Make sure to use matrix here to get the right type (as the shell doesn't know about gpus) - call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_TEMP), PETSC_NULL_VEC, ierr) - call MatCreateVecs(matrix, mat_ctx%mf_temp_vec(MF_VEC_RHS), mat_ctx%mf_temp_vec(MF_VEC_DIAG), ierr) + if (output_product) then + status_output(1:2, 1) = 1 + end if - ! Reusing + ! Complex conjugate roots, a +- ib else - call MatShellGetContext(inv_matrix, mat_ctx, ierr) + ! a^2 + b^2 + square_sum = coefficients(1,1)**2 + coefficients(1,2)**2 + + ! Complex conjugate roots + ! result = -A_ff + call MatScale(inv_matrix, -1d0, ierr) + ! result = 2a I - A_ff + ! Don't need an assemble as there is one called in this + call MatShift(inv_matrix, 2d0 * coefficients(1,1), ierr) + ! If we're doing this as part of fixed sparsity multiply, + ! we need to return mat_prod_or_temp + if (output_product) then + call MatConvert(inv_matrix, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + ! result = 2a I - A_ff/(a^2 + b^2) + call MatScale(inv_matrix, 1d0/square_sum, ierr) + + if (output_product) then + status_output(1:2, 2) = 1 + end if + end if + + end subroutine build_gmres_polynomial_newton_inverse_1st_1st + + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine build_gmres_polynomial_newton_inverse_full(matrix, poly_order, coefficients, & + inv_matrix, mat_prod_or_temp, poly_sparsity_order, output_first_complex, & + status_output, mat_product_save) + + ! No constrained sparsity by default + ! If you pass in mat_prod_or_temp, poly_sparsity_order, output_first_complex, status_output and + ! mat_product_save + ! then it will build part of the terms, up to poly_sparsity_order, and return the product + ! in mat_prod_or_temp and mat_product_save that you need to compute the rest of the fixed sparsity terms + + ! ~~~~~~ + type(tMat), intent(in) :: matrix + integer, intent(in) :: poly_order + PetscReal, dimension(:, :), target, contiguous, intent(inout) :: coefficients + type(tMat), intent(inout) :: inv_matrix + type(tMat), intent(inout), optional :: mat_prod_or_temp, mat_product_save + integer, intent(in), optional :: poly_sparsity_order + logical, intent(inout), optional :: output_first_complex + integer, dimension(:, :), intent(inout), optional :: status_output + + ! 
Local variables + PetscErrorCode :: ierr + logical :: reuse_triggered, output_product, first_complex + integer :: i, i_sparse, sparsity_order + type(tMat) :: mat_product, temp_mat_A, temp_mat_two, temp_mat_three, mat_product_k_plus_1 + PetscReal :: square_sum, a_coeff + + ! ~~~~~~ + reuse_triggered = .NOT. PetscObjectIsNull(inv_matrix) + output_product = present(mat_prod_or_temp) + + if (.NOT. reuse_triggered) then + ! Duplicate & copy the matrix, but ensure there is a diagonal present + call mat_duplicate_copy_plus_diag(matrix, .FALSE., inv_matrix) + end if + + ! Set to zero as we add in each product of terms + call MatScale(inv_matrix, 0d0, ierr) + + ! Don't set any off processor entries so no need for a reduction when assembling + call MatSetOption(inv_matrix, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr) + + ! We start with an identity in mat_product + call generate_identity(matrix, mat_product) + if (output_product) then + status_output = 0 end if + sparsity_order = poly_order + if (present(poly_sparsity_order)) then + sparsity_order = poly_sparsity_order + end if + + ! If we're going to output the product as part of a fixed sparsity multiply, + ! we may be asking to constrain the sparsity to a power in between order and order + 2 + ! if there is a complex root at poly_sparsity_order + ! ie if we have roots (theta_1^r, theta_2^c, theta_3^c, theta_4^r) + ! where ^r means a purely real root and ^c means a complex root + ! want poly_sparsity_order = 1, we can't process all the way up to theta_3^c as that would + ! compute up to an A^2 term which is beyond our sparsity constraint + + ! i_sparse tells us how many roots we are going to process + ! Normally this would just be size(coefficients, 1) and the loop below goes up + ! to size(coefficients, 1) - 1. The last real root gets its final term added outside the loop + ! and if the last root is complex then we only have to hit the first of the pair in the loop + ! + ! If we have fixed sparsity: + ! + ! if the fixed sparsity root is real then we want to set i_sparse to poly_sparsity_order+1 + ! so we hit the roots up to poly_sparsity_order in the loop and then we take care of the + ! poly_sparsity_order + 1 root outside the loop + ! + ! if the fixed sparsity root is complex but poly_sparsity_order + 1 hits the second of the pair + ! then we only need to set i_sparse to poly_sparsity_order + 1 so we only hit the first + ! pair in the loop below + ! + ! if the fixed sparsity root is complex but poly_sparsity_order + 1 hits the first of the pair + ! then we need to set i_sparse to poly_sparsity_order + 2 + ! otherwise we would never hit the first pair + + i_sparse = size(coefficients, 1) + first_complex = .FALSE. + + if (output_product) then + + output_first_complex = .FALSE. + if (output_product) then + i_sparse = sparsity_order + 1 + + ! If the last root is real we don't have to do anything + if (coefficients(i_sparse,2) /= 0d0) then + + ! If the one before is real, then we know we're on the first + if (coefficients(i_sparse-1,2) == 0d0) then + output_first_complex = .TRUE. + ! See discussion above + i_sparse = i_sparse + 1 + + ! If the one before is complex + else + + ! Check if the distance between the fixed sparsity root and the one before + ! If > zero then they are not complex conjugates and hence we are on the first of the pair + if (abs(coefficients(i_sparse,1) - coefficients(i_sparse-1,1))/coefficients(i_sparse,1) > 1e-14 .OR. 
& + abs(coefficients(i_sparse,2) + coefficients(i_sparse-1,2))/coefficients(i_sparse,2) > 1e-14) then + output_first_complex = .TRUE. + i_sparse = i_sparse + 1 + end if + end if + end if + end if + first_complex = output_first_complex + end if + + ! ~~~~~~~~~~~~ + ! Iterate over the i + ! This is basically the same as the MF application but we have to build the powers + ! ~~~~~~~~~~~~ + i = 1 + ! Loop through to one fewer than the number of roots + ! We're always building up the next product + do while (i .le. i_sparse - 1) + + ! Duplicate & copy the matrix, but ensure there is a diagonal present + ! temp_mat_A is going to store things with the sparsity of A + if (PetscObjectIsNull(temp_mat_A)) then + call mat_duplicate_copy_plus_diag(matrix, .FALSE., temp_mat_A) + else + ! Can reuse the sparsity + call mat_duplicate_copy_plus_diag(matrix, .TRUE., temp_mat_A) + end if + + ! If real this is easy + if (coefficients(i,2) == 0d0) then + + ! Skips eigenvalues that are numerically zero + ! We still compute the entries as as zero because we need the sparsity + ! to be correct for the next iteration + if (abs(coefficients(i,1)) < 1e-12) then + square_sum = 0 + else + square_sum = 1d0/coefficients(i,1) + end if + + ! Then add the scaled version of each product + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can do it directly + call MatShift(inv_matrix, 1d0/coefficients(i,1), ierr) + else + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, square_sum, mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, square_sum, mat_product) + end if + end if + if (output_product) status_output(i, 1) = 1 + + ! temp_mat_A = A_ff/theta_k + call MatScale(temp_mat_A, -square_sum, ierr) + ! temp_mat_A = I - A_ff/theta_k + call MatShift(temp_mat_A, 1d0, ierr) + + ! mat_product_k_plus_1 = mat_product * temp_mat_A + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can just copy + call MatDestroy(mat_product, ierr) + call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, mat_product, ierr) + else + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, mat_product_k_plus_1, ierr) + call MatDestroy(mat_product, ierr) + mat_product = mat_product_k_plus_1 + end if + + ! We copy out the last product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i == i_sparse - 1) then + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + i = i + 1 + + ! Complex + else + + ! Skips eigenvalues that are numerically zero + if (coefficients(i,1)**2 + coefficients(i,2)**2 < 1e-12) then + square_sum = 0 + a_coeff = 0 + else + square_sum = 1d0/(coefficients(i,1)**2 + coefficients(i,2)**2) + a_coeff = 2d0 * coefficients(i,1) + end if + + ! If our fixed sparsity root is the first of a complex conjugate pair + ! We want to pass out mat_product and only add that to inv_matrix + ! This is equivalent to only part of tmp on Line 9 of Loe + ! The fixed sparsity loop will then finish the tmp with the term -A * prod/(a^2+b^2) + ! as this is the part that would increase the sparsity beyond poly_sparsity_order + if (i == sparsity_order + 1 .AND. first_complex) then + + ! Copy mat_product into temp_mat_two + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + + ! 
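The i_sparse / output_first_complex selection made earlier in this routine can be read in isolation: given roots stored with conjugate pairs adjacent, and a fixed sparsity order, decide whether the root at poly_sparsity_order + 1 is real, the first of a conjugate pair, or the second. A condensed standalone sketch of just that decision follows; the roots are hypothetical and the tolerances are copied from the routine above.

program i_sparse_demo
   implicit none
   ! Hypothetical roots: real, complex conjugate pair, real
   real(8) :: re(4) = [2d0, 1d0, 1d0, 3d0]
   real(8) :: im(4) = [0d0, 0.5d0, -0.5d0, 0d0]
   integer :: i_sparse
   logical :: first_complex

   ! Sparsity order 1 lands on the first of the pair, so one extra root is processed
   call pick_i_sparse(re, im, 1, i_sparse, first_complex)
   print *, "i_sparse =", i_sparse, " first_complex =", first_complex   ! expect 3, T

contains

   subroutine pick_i_sparse(re, im, sparsity_order, i_sparse, first_complex)
      real(8), intent(in) :: re(:), im(:)
      integer, intent(in) :: sparsity_order
      integer, intent(out) :: i_sparse
      logical, intent(out) :: first_complex

      first_complex = .FALSE.
      i_sparse = sparsity_order + 1
      ! If the root at the fixed sparsity order is real there is nothing to do
      if (im(i_sparse) /= 0d0) then
         if (im(i_sparse - 1) == 0d0) then
            ! Previous root is real, so this must be the first of a conjugate pair
            first_complex = .TRUE.
            i_sparse = i_sparse + 1
         else if (abs(re(i_sparse) - re(i_sparse - 1))/re(i_sparse) > 1d-14 .OR. &
                  abs(im(i_sparse) + im(i_sparse - 1))/im(i_sparse) > 1d-14) then
            ! Previous root is complex but not our conjugate: again first of a pair
            first_complex = .TRUE.
            i_sparse = i_sparse + 1
         end if
      end if
   end subroutine pick_i_sparse

end program i_sparse_demo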
temp_mat_two = 2a * mat_product + call MatScale(temp_mat_two, a_coeff, ierr) + if (output_product) status_output(i, 2) = 1 + + ! We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + ! Just do the normal loop + else + + ! temp_mat_A = -A + call MatScale(temp_mat_A, -1d0, ierr) + ! temp_mat_A = 2a I - A_ff + call MatShift(temp_mat_A, a_coeff, ierr) + if (output_product) then + status_output(i, 2) = 1 + status_output(i+1, 2) = 1 + end if + + if (i == 1) then + ! If i == 1 then we know mat_product is identity so we can do it directly + call MatConvert(temp_mat_A, MATSAME, MAT_INITIAL_MATRIX, temp_mat_two, ierr) + else + ! temp_mat_two = temp_mat_A * mat_product + call MatMatMult(temp_mat_A, mat_product, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_two, ierr) + end if + + ! We copy out the last part of the old product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. i > i_sparse - 2) then + + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(temp_mat_two, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + ! If i == 1 then we know mat_product is the identity and we don't bother + ! to write it out, we just have some custom code in the product given its trivial + if (i /= 1) then + ! This ensures it has the matching sparsity + call MatDestroy(mat_product_save, ierr) + call MatConvert(mat_prod_or_temp, MATSAME, MAT_INITIAL_MATRIX, mat_product_save, ierr) + ! This zeros mat_product_save and then puts mat_product into the sparsity pattern + ! of mat_prod_or_temp + call MatCopy(mat_product, mat_product_save, DIFFERENT_NONZERO_PATTERN, ierr) + end if + end if + end if + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, square_sum, & + temp_mat_two, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, square_sum, temp_mat_two) + end if - mat_ctx%real_roots => coefficients(:, 1) - mat_ctx%imag_roots => coefficients(:, 2) - ! Now because the context reset deallocates the coefficient pointer - ! we want to make sure we don't leak memory, so we use pointer remapping here - ! to turn the 2D coefficient pointer into a 1D that we can store in mat_ctx%coefficients - ! and then the deallocate on mat_ctx%coefficients should still delete all the memory - mat_ctx%coefficients(1:2*size(coefficients,1)) => coefficients(:, :) - ! This is the matrix whose inverse we are applying (just copying the pointer here) - mat_ctx%mat = matrix - - end subroutine build_gmres_polynomial_newton_inverse + if (i .le. i_sparse - 2) then + + ! temp_mat_three = matrix * temp_mat_two + call MatMatMult(matrix, temp_mat_two, & + MAT_INITIAL_MATRIX, 1.5d0, temp_mat_three, ierr) + call MatDestroy(temp_mat_two, ierr) + if (output_product) then + status_output(i, 2) = 1 + end if + + ! Then add the scaled version of each product + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(mat_product, -square_sum, & + temp_mat_three, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(mat_product, -square_sum, temp_mat_three) + end if + + ! 
We copy out the last part of the product if we're doing this as part of a fixed sparsity multiply + if (output_product .AND. .NOT. first_complex) then + call MatDestroy(mat_prod_or_temp, ierr) + call MatConvert(mat_product, MATSAME, MAT_INITIAL_MATRIX, mat_prod_or_temp, ierr) + end if + + call MatDestroy(temp_mat_three, ierr) + else + call MatDestroy(temp_mat_two, ierr) + end if + + ! Skip two evals + i = i + 2 + + end if + end do + + ! Final step if last root is real + if (.NOT. first_complex) then + if (coefficients(i_sparse,2) == 0d0) then + ! Add in the final term multiplied by 1/theta_poly_order + + ! Skips eigenvalues that are numerically zero + if (abs(coefficients(i_sparse,1)) > 1e-12) then + + if (reuse_triggered) then + ! If doing reuse we know our nonzeros are a subset + call MatAXPY(inv_matrix, 1d0/coefficients(i_sparse,1), mat_product, SUBSET_NONZERO_PATTERN, ierr) + else + ! Have to use the DIFFERENT_NONZERO_PATTERN here + call MatAXPYWrapper(inv_matrix, 1d0/coefficients(i_sparse,1), mat_product) + end if + if (output_product) status_output(i_sparse, 1) = 1 + end if + end if + end if + + call MatDestroy(temp_mat_A, ierr) + call MatDestroy(mat_product, ierr) + + end subroutine build_gmres_polynomial_newton_inverse_full ! ------------------------------------------------------------------------------------------------------------------------------- diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index 071e646c..a1279ee2 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -67,14 +67,18 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in Mat *submatrices; // Pull out the nonlocal parts of the input mat we need + const PetscInt *colmap_input_mat; + PetscInt cols_ao_input = 0; if (mpi) { - PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local_input, &mat_nonlocal_input, NULL)); + PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local_input, &mat_nonlocal_input, &colmap_input_mat)); PetscCallVoid(MatMPIAIJGetSeqAIJ(*mat_sparsity_match, &mat_local_sparsity, &mat_nonlocal_sparsity, &colmap_mat_sparsity_match)); PetscCallVoid(MatGetSize(mat_nonlocal_sparsity, &rows_ao, &cols_ao)); PetscCallVoid(MatGetSize(mat_local_sparsity, &rows_ad, &cols_ad)); - + PetscInt rows_ao_input; + PetscCallVoid(MatGetSize(mat_nonlocal_input, &rows_ao_input, &cols_ao_input)); + // We need to pull out all the columns in the sparsity mat // and the nonlocal rows that correspond to the nonlocal columns // from the input mat @@ -175,6 +179,71 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, &device_local_vals_output, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_output, &device_nonlocal_j_output, &device_nonlocal_vals_output, &mtype)); + // ~~~~~~~~~~~~~~ + // Build a mapping from the input matrix's nonlocal column indices to the + // sparsity matrix's column space ("local" submat column space), which is defined as: + // [0..cols_ad-1] for local columns, [cols_ad..cols_ad+cols_ao-1] for sparsity colmap columns + // + // When doing the matrix-matrix product: + // 1. We need to compare local cols from local rows + // We need to access the local input matrix and we + // can do that directly given local indices are the same + // + // 2. 
We need to compare nonlocal cols from non-local rows + // We need to access submat for this which now only has the non-local rows in it + // Those will have a "local" column index that matches col_indices_off_proc_array given + // we create it with MatCreateSubMatrices + // + // 3. We need to compare nonlocal cols from local rows + // We need to access the input_matrix for this + // But (for higher order fixed sparsity) the colmap of the input matrix is not the same + // as the colmap of the sparsity matrix + // So below we create a mapping that converts from the input matrix's nonlocal column indices + // to the "local" column indices of the submat (which correspond to the sparsity matrix's column space) for the nonlocal columns + // If there are not matching entries in the sparsity colmap, we use a large sentinel value that will never + // match any col_orig and preserves sorted order. + // + // This mapping is needed because the input matrix and sparsity matrix may have + // different colmaps when poly_sparsity_order >= 2. + // ~~~~~~~~~~~~~~ + + // Use a sentinel larger than any valid column index + const PetscInt COLMAP_NOT_FOUND = cols_ad + cols_ao + 1; + + auto input_nonlocal_to_submat_col_d = PetscIntKokkosView("input_nonlocal_to_submat_col_d", mpi ? cols_ao_input : 1); + if (mpi && cols_ao_input > 0) + { + // Build the mapping on the host + // Both colmaps are sorted, so we can do a merge-style scan + auto input_nonlocal_to_submat_col_h = Kokkos::create_mirror_view(input_nonlocal_to_submat_col_d); + PetscInt sparsity_colmap_idx = 0; + for (PetscInt k = 0; k < cols_ao_input; k++) + { + PetscInt global_col = colmap_input_mat[k]; + // Advance the sparsity colmap index (both are sorted) + while (sparsity_colmap_idx < cols_ao && colmap_mat_sparsity_match[sparsity_colmap_idx] < global_col) + { + sparsity_colmap_idx++; + } + if (sparsity_colmap_idx < cols_ao && colmap_mat_sparsity_match[sparsity_colmap_idx] == global_col) + { + input_nonlocal_to_submat_col_h(k) = cols_ad + sparsity_colmap_idx; + } + else + { + // Not found — use sentinel value that preserves sort order + // Since colmap_input is sorted and colmap_sparsity is sorted, + // if an entry is missing it's between two found entries, + // so we assign a value that maintains monotonicity + input_nonlocal_to_submat_col_h(k) = COLMAP_NOT_FOUND; + } + } + Kokkos::deep_copy(input_nonlocal_to_submat_col_d, input_nonlocal_to_submat_col_h); + // Log copy with petsc + bytes = input_nonlocal_to_submat_col_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + } + // ~~~~~~~~~~~~~~ // Find maximum non-zeros per row for sizing scratch memory // ~~~~~~~~~~~~~~ @@ -358,8 +427,11 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in while (idx_col_of_row_i < ncols_row_i && idx_col_of_row_j < ncols_row_of_col_j) { // The col_target is the column we are trying to match in the row of column j - // We always convert it to the "local" indexing as if it were in the columns of the submat, ie - // the column indexing of [local cols; local cols + 0:cols_ao-1] + // We convert everything to the submat "local" column space for comparison, ie + // the column indexing of [0..cols_ad-1 for local cols; cols_ad+k for sparsity colmap[k]] + // When the input matrix and sparsity matrix have different colmaps + // (poly_sparsity_order >= 2), we use the input_nonlocal_to_submat_col_d mapping + // to convert the input matrix's nonlocal column indices to the sparsity colmap space PetscInt col_target; if 
(row_of_col_j_local)
                  {
@@ -369,8 +441,11 @@
                  }
                  else
                  {
-                     // Convert to "local" column index of submat by adding cols_ad
-                     col_target = device_nonlocal_j_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j] + cols_ad;
+                     // This is the case where we need to access non-local columns in local rows of input_matrix
+                     // and hence we need our mapping
+                     // Convert the nonlocal column index from the input matrix's colmap space
+                     // to the "local" column index of submat
+                     col_target = input_nonlocal_to_submat_col_d(device_nonlocal_j_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j]);
                  }
               }
               else
@@ -390,40 +465,45 @@
                  // Convert to "local" column index of submat by adding cols_ad
                  col_orig = device_nonlocal_j_sparsity[device_nonlocal_i_sparsity[i] + (idx_col_of_row_i - local_cols_row_i)] + cols_ad;
               }
-
-              if (col_orig < col_target) {
-                 // Original column is smaller, move to next original column
-                 idx_col_of_row_i++;
-              } else if (col_orig > col_target) {
-                 // Target column is smaller, move to next target column
+
+              // Skip entries where the input column doesn't exist in the sparsity pattern
+              if (col_target == COLMAP_NOT_FOUND) {
                  idx_col_of_row_j++;
-                 // We've found a matching index and hence we can do our compute
              } else {
-
-                 PetscReal val_target;
-                 if (row_of_col_j_local)
-                 {
-                    if (idx_col_of_row_j < local_cols_row_of_col_j)
+                 if (col_orig < col_target) {
+                    // Original column is smaller, move to next original column
+                    idx_col_of_row_i++;
+                 } else if (col_orig > col_target) {
+                    // Target column is smaller, move to next target column
+                    idx_col_of_row_j++;
+                 // We've found a matching index and hence we can do our compute
+                 } else {
+
+                    PetscReal val_target;
+                    if (row_of_col_j_local)
                     {
-                       val_target = device_local_vals_input[device_local_i_input[row_of_col_j] + idx_col_of_row_j];
+                       if (idx_col_of_row_j < local_cols_row_of_col_j)
+                       {
+                          val_target = device_local_vals_input[device_local_i_input[row_of_col_j] + idx_col_of_row_j];
+                       }
+                       else
+                       {
+                          val_target = device_nonlocal_vals_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j];
+                       }
                     }
                     else
                     {
-                       val_target = device_nonlocal_vals_input[device_nonlocal_i_input[row_of_col_j] + idx_col_of_row_j - local_cols_row_of_col_j];
+                       val_target = device_submat_vals[device_submat_i[row_of_col_j] + idx_col_of_row_j];
                     }
-                 }
-                 else
-                 {
-                    val_target = device_submat_vals[device_submat_i[row_of_col_j] + idx_col_of_row_j];
-                 }
-                 // Has to be atomic!
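The host-side mapping built earlier in this function is a merge-style scan over two sorted colmaps, with a sentinel for columns that are missing from the sparsity pattern; that sentinel is what makes the COLMAP_NOT_FOUND skip above safe. A standalone sketch of that scan, with hypothetical colmaps and 1-based indexing where the C++ above is 0-based:

program colmap_merge_demo
   implicit none
   integer, parameter :: cols_ad = 10
   integer, parameter :: colmap_input(5) = [3, 7, 12, 20, 31]
   integer, parameter :: colmap_sparsity(6) = [3, 7, 9, 20, 25, 31]
   integer :: mapping(5), not_found, k, idx

   ! Sentinel larger than any valid translated index, so sorted order is preserved
   not_found = cols_ad + size(colmap_sparsity) + 1

   idx = 1
   do k = 1, size(colmap_input)
      ! Both colmaps are sorted, so the sparsity index only ever advances
      do while (idx <= size(colmap_sparsity) .AND. colmap_sparsity(idx) < colmap_input(k))
         idx = idx + 1
      end do
      if (idx <= size(colmap_sparsity) .AND. colmap_sparsity(idx) == colmap_input(k)) then
         mapping(k) = cols_ad + idx - 1   ! cols_ad + 0-based offset in the C++ code
      else
         mapping(k) = not_found           ! e.g. column 12 is not in the sparsity colmap
      end if
   end do
   print *, mapping
end program colmap_merge_demo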
Potentially lots of contention so maybe not + // the most performant way to do this + Kokkos::atomic_add(&vals_temp[idx_col_of_row_i], vals_prev[j] * val_target); - // Move forward in both arrays - idx_col_of_row_i++; - idx_col_of_row_j++; + // Move forward in both arrays + idx_col_of_row_i++; + idx_col_of_row_j++; + } } } }); diff --git a/src/PCPFLAREINV.c b/src/PCPFLAREINV.c index b7f3a8e3..8b6826bd 100644 --- a/src/PCPFLAREINV.c +++ b/src/PCPFLAREINV.c @@ -18,8 +18,8 @@ PETSC_EXTERN void calculate_and_build_approximate_inverse_c(Mat *input_mat, Pets // // PFLAREINV_POWER - GMRES polynomial with the power basis // PFLAREINV_ARNOLDI - GMRES polynomial with the arnoldi basis -// PFLAREINV_NEWTON - GMRES polynomial with the newton basis with extra roots for stability - can only be used matrix-free atm -// PFLAREINV_NEWTON_NO_EXTRA - GMRES polynomial with the newton basis without extra roots - can only be used matrix-free atm +// PFLAREINV_NEWTON - GMRES polynomial with the newton basis with extra roots for stability +// PFLAREINV_NEWTON_NO_EXTRA - GMRES polynomial with the newton basis without extra roots // PFLAREINV_NEUMANN - Neumann polynomial // PFLAREINV_SAI - SAI - cannot be used matrix-free atm // PFLAREINV_ISAI - Incomplete SAI - cannot be used matrix-free atm @@ -335,11 +335,6 @@ static PetscErrorCode PCSetUp_PFLAREINV_c(PC pc) // ~~~~~~~ PetscCall(PCPFLAREINVGetType(pc, &type)); - // Newton has to be matrix free - if (type == PFLAREINV_NEWTON || type == PFLAREINV_NEWTON_NO_EXTRA) - { - PetscCheck(inv_data->matrix_free, comm, PETSC_ERR_ARG_WRONGSTATE, "GMRES polynomial with Newton basis must be applied matrix-free"); - } // SAI/ISAI can't be matrix free if (type == PFLAREINV_SAI || type == PFLAREINV_ISAI) { diff --git a/tests/Makefile b/tests/Makefile index 5dd2e454..f86f49ce 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -93,6 +93,8 @@ run_tests_load_serial: ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -ksp_max_it 21 @echo "Test single level GMRES polynomial preconditioning for hyperbolic streaming problem in C" ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type power -ksp_max_it 21 + @echo "Test single level Newton GMRES polynomial preconditioning for hyperbolic streaming problem in C" + ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type newton -ksp_max_it 21 # @echo "" @echo "Test single level GMRES polynomial preconditioning with the Newton basis matrix-free for hyperbolic streaming problem in C" @@ -121,6 +123,42 @@ run_tests_load_serial: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals" ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 + @echo "Test AIRG with Newton GMRES polynomials in indefinite problem with zero diagonals" + ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 26 +# + @echo "" + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order" + ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order" + ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 +# + @echo "" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order 
---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + done; \ + done +# + @echo "" + @echo "Test Newton GMRES polynomials order 60 matrix-free with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials order 60 with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 60 -ksp_norm_type unpreconditioned -ksp_max_it 6 + @echo "Test Newton GMRES polynomials order 120 matrix-free with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -pc_pflareinv_matrix_free -ksp_norm_type unpreconditioned -ksp_max_it 5 + @echo "Test Newton GMRES polynomials order 120 with fixed sparsity with added roots in market matrix problem 1138" + ./ex6 -b_in_f 0 -f data/1138_bus -pc_type pflareinv -pc_pflareinv_type newton -pc_pflareinv_order 120 -ksp_norm_type unpreconditioned -ksp_max_it 5 +# + @echo "Test Newton AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing" + ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -156,6 +194,30 @@ run_tests_load_parallel: @echo "" @echo "Test AIRG with GMRES polynomials in indefinite problem with zero diagonals in parallel" $(MPIEXEC) -n 2 ./ex6 -f data/e05r0100_petsc -b_in_f 0 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 26 +# + @echo "" + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 2nd order in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 2 + @echo "Test AIRG with assembled Newton GMRES polynomials for hyperbolic streaming problem for 3rd order in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -ksp_max_it 5 -pc_air_inverse_type newton -pc_air_poly_order 3 +# + @echo "" + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders in parallel" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$order; \ + done + @echo "Test difference between GMRES polynomial forms for hyperbolic streaming problem for different orders and fixed sparsity in parallel" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + $(MPIEXEC) -n 2 ./ex12f_gmres_poly -f data/mat_stream_2364 -pc_air_poly_order $$order -pc_air_inverse_sparsity_order $$sparsity; \ + done; \ + done +# 
+ @echo "Test Newton AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing in parallel" + $(MPIEXEC) -n 2 ./ex12f -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -pc_air_matrix_free_polys -ksp_max_it 5 # ~~~~~~~~~~~ # ~~~~~~~~~~~ @@ -260,6 +322,32 @@ run_tests_no_load_serial: ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change with 0th order fixed sparsity" ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 +# + @echo "" + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 0 + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 +# + @echo "" + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 1 + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 +# + @echo "" + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC reused with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 +# + @echo "" + @echo "Test AIRG Newton with GMRES polynomials with PC reused with no sparsity change with 0th order fixed sparsity" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 + @echo "Test AIRG Newton with GMRES polynomials with PC regenerated with no sparsity change with 0th order fixed sparsity" + ./ex6f -m 10 -n 10 -pc_air_a_drop 1e-3 -pc_air_inverse_type newton -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 # @echo "" @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change and polynomial coeffs stored" @@ -351,7 +439,32 @@ run_tests_no_load_serial: -pc_air_improve_w_its 3 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power @echo "Test improving Z with PC regenerated with no sparsity change with 1 iteration" ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -pc_air_one_point_classical_prolong 0 \ - -pc_air_improve_w_its 1 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power + -pc_air_improve_w_its 1 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo 
"Test Newton AIRG on advection for for different orders" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 6; \ + done + @echo "Test Newton AIRG on advection for for different orders and fixed sparsity" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done; \ + done +# + @echo "" + @echo "Test Newton GMRES polynomial on a diagonal matrix" + ./mat_diag -ksp_max_it 1 + @echo "Test Arnoldi GMRES polynomial on a diagonal matrix" + ./mat_diag -pc_pflareinv_type arnoldi -ksp_max_it 1 + @echo "Test Power GMRES polynomial on a diagonal matrix" + ./mat_diag -pc_pflareinv_type power -ksp_max_it 1 # # ~~~~~~~~~~~~~~~~~~~~~~~ # Include kokkos examples @@ -484,6 +597,41 @@ run_tests_no_load_parallel: @echo "Test AIRG with GMRES polynomials with PC regenerated with no sparsity change in parallel with 0th order fixed sparsity" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 \ -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 0th order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 1st order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 1 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC reused with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_poly_order 2 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with 2nd order fixed sparsity GMRES polynomials with PC regenerated with no sparsity change in parallel" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 2 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton +# + @echo "" + @echo "Test AIRG Newton with GMRES 
polynomials with PC reused with no sparsity change in parallel with 0th order fixed sparsity" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -ksp_max_it 10 -pc_air_inverse_sparsity_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton + @echo "Test AIRG Newton with GMRES polynomials with PC regenerated with no sparsity change in parallel with 0th order fixed sparsity" + $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -ksp_max_it 3 -pc_air_inverse_sparsity_order 0 \ + -pc_air_a_drop 1e-3 -pc_air_inverse_type newton # @echo "" @echo "Test solving isotropic diffusion with fast coarsening and near-nullspace in parallel" @@ -514,6 +662,23 @@ run_tests_no_load_parallel: @echo "Test improving W with PC regenerated with no sparsity change in parallel" $(MPIEXEC) -n 2 ./ex6f -m 10 -n 10 -regen -pc_air_reuse_sparsity -pc_air_one_point_classical_prolong 0 \ -pc_air_improve_w_its 3 -ksp_max_it 3 -pc_air_a_drop 1e-3 -pc_air_inverse_type power +# + @echo "" + @echo "Test Newton AIRG on advection for for different orders in parallel" + @set -e; for order in 0 1 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$order -ksp_norm_type unpreconditioned -ksp_max_it 6; \ + done + @echo "Test Newton AIRG on advection for for different orders and fixed sparsity in parallel" + @set -e; for order in 2 3 4 5 6; do \ + echo "--- Testing order = $$order ---"; \ + for sparsity in $$(seq 1 $$(($$order - 1))); do \ + echo " --- Testing sparsity order = $$sparsity ---"; \ + $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x 50 -da_grid_y 50 -pc_type air -pc_air_inverse_type newton -pc_air_poly_order $$order \ + -pc_air_inverse_sparsity_order $$sparsity -ksp_norm_type unpreconditioned -ksp_max_it 5; \ + done; \ + done # # ~~~~~~~~~~~~~~~~~~~~~~~ @@ -535,7 +700,7 @@ run_tests_medium_serial: # @echo "" - @for size in $$(seq 100 20 200); do \ + @set -e; for size in $$(seq 100 20 200); do \ echo "--- Testing size = $$size x $$size ---"; \ ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -543,7 +708,7 @@ run_tests_medium_serial: # @echo "" @echo "--- Running scaling study on adv_diff_2d ---" - @for size in 100 200 400 800; do \ + @set -e; for size in 100 200 400 800; do \ echo "--- Testing size = $$size x $$size ---"; \ ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -555,7 +720,7 @@ run_tests_medium_parallel: # @echo "" - @for size in $$(seq 100 20 200); do \ + @set -e; for size in $$(seq 100 20 200); do \ echo "--- Testing size = $$size x $$size ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ @@ -563,7 +728,7 @@ run_tests_medium_parallel: # @echo "" @echo "--- Running scaling study on adv_diff_2d in parallel ---" - @for size in 100 200 400 800; do \ + @set -e; for size in 100 200 400 800; do \ echo "--- Testing size = $$size x $$size ---"; \ $(MPIEXEC) -n 2 ./adv_diff_2d -da_grid_x $$size -da_grid_y $$size -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 
-pc_air_a_lump -pc_air_a_drop 1e-5 -pc_air_strong_threshold 0.99 -ksp_max_it 6; \ diff --git a/tests/data/1138_bus b/tests/data/1138_bus new file mode 100644 index 00000000..dc745535 Binary files /dev/null and b/tests/data/1138_bus differ diff --git a/tests/data/1138_bus.info b/tests/data/1138_bus.info new file mode 100644 index 00000000..270767f1 --- /dev/null +++ b/tests/data/1138_bus.info @@ -0,0 +1 @@ +-matload_block_size 1 diff --git a/tests/ex12f_gmres_poly.F90 b/tests/ex12f_gmres_poly.F90 new file mode 100644 index 00000000..c0d188e0 --- /dev/null +++ b/tests/ex12f_gmres_poly.F90 @@ -0,0 +1,182 @@ +! + program main +#include + use petscksp +#include "finclude/pflare.h" + implicit none + +! Comparison between different forms of GMRES polynomials + + PetscErrorCode ierr + PetscInt m,n,mlocal,nlocal + PetscBool flg, check, no_power + PetscReal norm_power, norm_rhs, norm_arnoldi, norm_newton + PetscReal :: norm_diff_one, norm_diff_two + Vec x,b,u, b_diff_type + Mat A, A_diff_type + character*(128) f + PetscViewer fd + KSP ksp + PC pc + KSPConvergedReason reason + PetscInt, parameter :: one=1 + MatType :: mtype, mtype_input + + call PetscInitialize(PETSC_NULL_CHARACTER,ierr) + if (ierr .ne. 0) then + print*,'Unable to initialize PETSc' + stop + endif + +! Read in matrix and RHS + call PetscOptionsGetString(PETSC_NULL_OPTIONS, & + & PETSC_NULL_CHARACTER,'-f',f,flg,ierr) + call PetscViewerBinaryOpen(PETSC_COMM_WORLD,f,FILE_MODE_READ, & + & fd,ierr) + no_power = PETSC_FALSE + ! Our CI has an intel pipeline and the intel MPI breaks with the power basis + ! so we can disable the power basis test with a command line option + call PetscOptionsGetBool(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER, & + '-no_power', check,flg,ierr) + if (flg) no_power = check + + call MatCreate(PETSC_COMM_WORLD,A,ierr) + call MatLoad(A,fd,ierr) + + ! Get information about matrix + call MatGetSize(A,m,n,ierr) + call MatGetLocalSize(A,mlocal,nlocal,ierr) + + call VecCreate(PETSC_COMM_WORLD,b,ierr) + call VecLoad(b,fd,ierr) + call PetscViewerDestroy(fd,ierr) + + ! Test and see if the user wants us to use a different matrix type + ! with -mat_type on the command line + ! This lets us easily test our cpu and kokkos versions through our CI + call MatCreateFromOptions(PETSC_COMM_WORLD,PETSC_NULL_CHARACTER,& + one,mlocal,nlocal,m,n,A_diff_type,ierr) + call MatAssemblyBegin(A_diff_type,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A_diff_type,MAT_FINAL_ASSEMBLY,ierr) + + call MatGetType(A, mtype, ierr) + call MatGetType(A_diff_type, mtype_input, ierr) + + if (mtype /= mtype_input) then + ! Doesn't seem like there is a converter to kokkos + ! So instead we just copy into the empty A_diff_type + ! This will be slow as its not preallocated, but this is just for testing + call MatCopy(A, A_diff_type, DIFFERENT_NONZERO_PATTERN, ierr) + call MatDestroy(A,ierr) + A = A_diff_type + + ! Mat and vec types have to match + call VecCreateFromOptions(PETSC_COMM_WORLD,PETSC_NULL_CHARACTER, & + one,nlocal,n,b_diff_type,ierr) + call VecCopy(b,b_diff_type,ierr) + call VecDestroy(b,ierr) + b = b_diff_type + + else + call MatDestroy(A_diff_type,ierr) + end if + + ! Set up solution + call VecDuplicate(b,x,ierr) + call VecDuplicate(b,u,ierr) + + ! Register the pflare types + call PCRegister_PFLARE() + + call VecNorm(b,NORM_2,norm_rhs,ierr) + + ! ~~~~~~~~~~~~~ + ! Do a solve with the Arnoldi basis + ! 
+ + ! ~~~~~~~~~~~~~ + ! Do a solve with the Arnoldi basis + ! ~~~~~~~~~~~~~ + call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) + call KSPSetOperators(ksp,A,A,ierr) + call KSPGetPC(ksp, pc, ierr) + call PCSetType(pc, PCAIR, ierr) + call PCAIRSetInverseType(pc, PFLAREINV_ARNOLDI, ierr) + call KSPSetPC(ksp, pc, ierr) + call KSPSetFromOptions(ksp,ierr) + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_arnoldi,ierr) + norm_arnoldi = norm_arnoldi/norm_rhs + + ! ~~~~~~~~~~~~~ + ! Now do a solve with the Power basis + ! ~~~~~~~~~~~~~ + if (.NOT. no_power) then + call PCAIRSetInverseType(pc, PFLAREINV_POWER, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_power,ierr) + norm_power = norm_power/norm_rhs + end if + + ! ~~~~~~~~~~~~~ + ! Now do a solve with the Newton basis + ! ~~~~~~~~~~~~~ + call PCAIRSetInverseType(pc, PFLAREINV_NEWTON, ierr) + + call VecSet(x, 0d0, ierr) + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp,reason,ierr) + if (reason%v < 0) then + error stop 1 + end if + ! Compute the residual + call MatMult(A,x,u,ierr) + call VecAXPY(u,-1d0,b,ierr) + call VecNorm(u,NORM_2,norm_newton,ierr) + norm_newton = norm_newton/norm_rhs + call KSPDestroy(ksp,ierr) + + ! ~~~~~~~~~~~~~ + ! Now check that all the residuals are the same + ! For low order polynomials on the diagonally dominant + ! A_ff on each level the bases should be essentially identical, and hence + ! there should be almost no difference in the resulting residuals + ! ~~~~~~~~~~~~~ + norm_diff_one = abs(norm_arnoldi - norm_newton)/norm_arnoldi + if (norm_diff_one > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_one + print *, "Newton basis residual: ", norm_newton + print *, "Arnoldi basis residual: ", norm_arnoldi + error stop 1 + end if + if (.NOT. no_power) then + norm_diff_two = abs(norm_arnoldi - norm_power)/norm_arnoldi + if (norm_diff_two > 1e-9) then + print *, "Residuals differ between polynomial bases!", norm_diff_two + print *, "Power basis residual: ", norm_power + print *, "Arnoldi basis residual: ", norm_arnoldi + error stop 1 + end if + end if + + call VecDestroy(b,ierr) + call VecDestroy(x,ierr) + call VecDestroy(u,ierr) + call MatDestroy(A,ierr) + + call PetscFinalize(ierr) + + end \ No newline at end of file diff --git a/tests/mat_diag.F90 b/tests/mat_diag.F90 new file mode 100644 index 00000000..1d6a54af --- /dev/null +++ b/tests/mat_diag.F90 @@ -0,0 +1,119 @@ +#include "petsc/finclude/petscksp.h" +#include "finclude/pflare.h" + use petscksp + implicit none + + ! Test that the gmres polynomials can handle a small solve with a diagonal matrix + ! We leave the polynomial order here as the default (which is 6), despite + ! the fact that a much lower polynomial order gives an exact solve in this case + ! This tests that the various gmres polynomial methods correctly + ! identify that only a lower order is needed + + PetscErrorCode :: ierr + Mat :: A + PetscInt :: m, n, nnzs + PetscInt, parameter :: one = 1, two = 2, three = 3, zero = 0 + Vec :: x,b + KSP :: ksp + PC :: pc + PetscBool :: flg + KSPConvergedReason :: reason + PetscRandom :: rctx + + call PetscInitialize(PETSC_NULL_CHARACTER,ierr)
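+ ! Guard against failed initialization, mirroring the check in ex12f_gmres_poly + if (ierr .ne. 0) then + print*,'Unable to initialize PETSc' + stop + endif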
+ ! Register the pflare types + call PCRegister_PFLARE() + + m = 10 + n = 10 + call PetscOptionsGetInt(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER,'-m',m,flg,ierr) + call PetscOptionsGetInt(PETSC_NULL_OPTIONS,PETSC_NULL_CHARACTER,'-n',n,flg,ierr) + + ! Create matrix + call MatCreate(PETSC_COMM_WORLD,A,ierr) + call MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,m*n,m*n,ierr) + call MatSetFromOptions(A,ierr) + nnzs = m + call MatSeqAIJSetPreallocation(A, nnzs, PETSC_NULL_INTEGER_ARRAY, ierr) + call MatMPIAIJSetPreallocation(A, nnzs, PETSC_NULL_INTEGER_ARRAY, nnzs, PETSC_NULL_INTEGER_ARRAY, ierr) + call MatSetUp(A,ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE, ierr) + + call MatCreateVecs(A,b,x,ierr) + call VecSet(x, 0d0, ierr) + + ! Random rhs + call PetscRandomCreate(PETSC_COMM_WORLD, rctx, ierr) + call PetscRandomSetFromOptions(rctx, ierr) + call VecSetRandom(b, rctx, ierr) + call PetscRandomDestroy(rctx, ierr) + + ! ~~~~~~~~~~~~~~ + ! Set constant diagonal values in the matrix + ! In Newton form we should only need a single root + ! (i.e., a 0th order polynomial) for an exact solve + ! Starting with the identity, the inverse should also be the identity + ! ~~~~~~~~~~~~~~ + call MatShift(A, 1d0, ierr) + + call KSPCreate(PETSC_COMM_WORLD,ksp,ierr) + call KSPSetOperators(ksp,A,A,ierr) + call KSPGetPC(ksp,pc,ierr) + ! Set the newton gmres polynomial as the PC + call PCSetType(pc, PCPFLAREINV, ierr) + call PCPFLAREINVSetType(pc, PFLAREINV_NEWTON, ierr) + call KSPSetPC(ksp, pc, ierr) + call KSPSetFromOptions(ksp,ierr) + call KSPSetUp(ksp,ierr) + + ! Do the solve + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + ! ~~~~~~~~~~~~~~ + ! Now set the diagonal to 1.5 instead + ! In Newton form we should still only need a single root + ! ~~~~~~~~~~~~~~ + call MatShift(A, 0.5d0, ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call VecSet(x, 0d0, ierr) + + ! Do another solve - this will automatically trigger the setup as the matrix + ! has changed + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + ! ~~~~~~~~~~~~~~ + ! Now have two different constant values on the diagonal + ! In Newton form we should only need two roots + ! (i.e., a 1st order polynomial) for an exact solve + ! ~~~~~~~~~~~~~~ + ! Set one of the values to 2.5 + call MatSetValue(A, zero, zero, 2.5d0, INSERT_VALUES, ierr) + call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr) + call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr) + call VecSet(x, 0d0, ierr) + + ! Do another solve - this will automatically trigger the setup as the matrix + ! has changed + call KSPSolve(ksp,b,x,ierr) + call KSPGetConvergedReason(ksp, reason, ierr) + if (reason%v < 0) then + error stop 1 + end if + + call MatDestroy(A, ierr) + call VecDestroy(b, ierr) + call VecDestroy(x, ierr) + call KSPDestroy(ksp, ierr) + call PetscFinalize(ierr) + end \ No newline at end of file