From fb0de46755e8776ac8dd965b61c6934844deca20 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Fri, 30 Aug 2024 12:26:12 +1000 Subject: [PATCH 01/14] Updated to PartitionedArrays v0.5 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 1dc2bd64..494e3732 100644 --- a/Project.toml +++ b/Project.toml @@ -20,7 +20,7 @@ FillArrays = "0.8.4,1" Gridap = "0.18.3" LinearAlgebra = "1.3" MPI = "0.16, 0.17, 0.18, 0.19, 0.20" -PartitionedArrays = "0.3.3" +PartitionedArrays = "0.5" SparseArrays = "1.3" SparseMatricesCSR = "0.6.6" WriteVTK = "1.12.0" From 9678cbd448f874513af8f99af1ee53089fbaa390 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Fri, 30 Aug 2024 12:26:34 +1000 Subject: [PATCH 02/14] Assembly working, no reuse --- src/Algebra.jl | 174 ++++++++++++++-------------- src/BlockPartitionedArrays.jl | 4 +- src/FESpaces.jl | 4 +- test/BlockPartitionedArraysTests.jl | 2 +- test/parrays_update.jl | 36 ++++++ 5 files changed, 128 insertions(+), 92 deletions(-) create mode 100644 test/parrays_update.jl diff --git a/src/Algebra.jl b/src/Algebra.jl index 57bcbac3..11af9149 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -62,8 +62,8 @@ function Algebra.axpy_entries!( # We should definitely check here that the index partitions are the same. # However: Because the different matrices are assembled separately, the objects are not the # same (i.e can't use ===). Checking the index partitions would then be costly... - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) map(partition(A),partition(B)) do A, B Algebra.axpy_entries!(α,A,B;check) end @@ -394,13 +394,14 @@ struct PSparseMatrixBuilderCOO{T,B} end function Algebra.nz_counter( - builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}) where A - test_dofs_gids_prange, trial_dofs_gids_prange = axs - counters = map(partition(test_dofs_gids_prange),partition(trial_dofs_gids_prange)) do r,c + builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange} +) where A + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do r,c axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) Algebra.CounterCOO{A}(axs) end - DistributedCounterCOO(builder.par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) + DistributedCounterCOO(builder.par_strategy,counters,rows,cols) end function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv @@ -414,56 +415,58 @@ end struct DistributedCounterCOO{A,B,C,D} <: GridapType par_strategy::A counters::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D + test_gids::C + trial_gids::D function DistributedCounterCOO( par_strategy, counters::AbstractArray{<:Algebra.CounterCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) + test_gids::PRange, + trial_gids::PRange) A = typeof(par_strategy) B = typeof(counters) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) end end function 
local_views(a::DistributedCounterCOO) - a.counters + return a.counters end -function local_views(a::DistributedCounterCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange - a.counters +function local_views(a::DistributedCounterCOO,test_gids,trial_gids) + @check PArrays.matching_local_indices(test_gids,a.test_gids) + @check PArrays.matching_local_indices(trial_gids,a.trial_gids) + return a.counters end function Algebra.nz_allocation(a::DistributedCounterCOO) allocs = map(nz_allocation,a.counters) - DistributedAllocationCOO(a.par_strategy,allocs,a.test_dofs_gids_prange,a.trial_dofs_gids_prange) + DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) end struct DistributedAllocationCOO{A,B,C,D} <: GridapType par_strategy::A allocs::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D + test_gids::C + trial_gids::D function DistributedAllocationCOO( par_strategy, allocs::AbstractArray{<:Algebra.AllocationCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) + test_gids::PRange, + trial_gids::PRange) A = typeof(par_strategy) B = typeof(allocs) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,allocs,test_dofs_gids_prange,trial_dofs_gids_prange) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) end end -function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, - axes::Tuple{<:PRange,<:PRange}) where {A,B} +function change_axes( + a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, + axes::Tuple{<:PRange,<:PRange} +) where {A,B} local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) end @@ -471,8 +474,10 @@ function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) end -function change_axes(a::MatrixBlock{<:DistributedAllocationCOO}, - axes::Tuple{<:Vector,<:Vector}) +function change_axes( + a::MatrixBlock{<:DistributedAllocationCOO}, + axes::Tuple{<:Vector,<:Vector} +) block_ids = CartesianIndices(a.array) rows, cols = axes array = map(block_ids) do I @@ -485,9 +490,9 @@ function local_views(a::DistributedAllocationCOO) a.allocs end -function local_views(a::DistributedAllocationCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange +function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) + @check test_gids === a.test_gids + @check trial_gids === a.trial_gids a.allocs end @@ -508,8 +513,8 @@ function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) return tuple_of_array_of_parrays end -get_test_gids(a::DistributedAllocationCOO) = a.test_dofs_gids_prange -get_trial_gids(a::DistributedAllocationCOO) = a.trial_dofs_gids_prange +get_test_gids(a::DistributedAllocationCOO) = a.test_gids +get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array)) @@ -536,31 +541,24 @@ function _fa_create_from_nz_with_callback(callback,a) # Recover some data I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) 
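+  # Note: at this point I,J,V hold per-part COO triplets indexed by the *local*
+  # numbering of the FE spaces; they are mapped to global ids below, used to
+  # build the final (rows,cols) partition, and then mapped back to its local ids.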
+ test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) - rows = _setup_prange(test_dofs_gids_prange,I;ghost=false,ax=:rows) + rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) b = callback(rows) # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) # Create the range for cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols) + cols = _setup_prange(trial_gids,J;ax=:cols) # Convert again I,J to local numeration to_local_indices!(I,rows;ax=:rows) to_local_indices!(J,cols;ax=:cols) - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress local portions - values = map(create_from_nz,local_views(asys)) - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) + A = _setup_matrix(I,J,V,rows,cols) return A, b end @@ -579,19 +577,19 @@ end function _sa_create_from_nz_with_callback(callback,async_callback,a,b) # Recover some data I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) # Create the Prange for the rows - rows = _setup_prange(test_dofs_gids_prange,I;ax=:rows) + rows = _setup_prange(test_gids,I;ax=:rows) # Move (I,J,V) triplets to the owner process of each row I. # The triplets are accompanyed which Jo which is the process column owner - Jo = get_gid_owners(J,trial_dofs_gids_prange;ax=:cols) + Jo = get_gid_owners(J,trial_gids;ax=:cols) t = _assemble_coo!(I,J,V,rows;owners=Jo) # Here we can overlap computations @@ -606,7 +604,7 @@ function _sa_create_from_nz_with_callback(callback,async_callback,a,b) wait(t) # Create the Prange for the cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols,owners=Jo) + cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) # Overlap rhs communications with CSC compression t2 = async_callback(b) @@ -615,19 +613,13 @@ function _sa_create_from_nz_with_callback(callback,async_callback,a,b) to_local_indices!(I,rows;ax=:rows) to_local_indices!(J,cols;ax=:cols) - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress the local matrices - values = map(create_from_nz,local_views(asys)) + A = _setup_matrix(I,J,V,rows,cols) # Wait the transfer to finish if !isa(t2,Nothing) wait(t2) end - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) + return A, b end @@ -657,7 +649,7 @@ end struct PVectorCounter{A,B,C} par_strategy::A counters::B - test_dofs_gids_prange::C + test_gids::C end Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() @@ -667,12 +659,12 @@ function local_views(a::PVectorCounter) end function local_views(a::PVectorCounter,rows) - @check rows === a.test_dofs_gids_prange + @check rows === a.test_gids a.counters end function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows}) - dofs = a.test_dofs_gids_prange + dofs = a.test_gids values = map(nz_allocation,a.counters) PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) end @@ -680,7 +672,7 @@ end struct PVectorAllocationTrackOnlyValues{A,B,C} par_strategy::A values::B - test_dofs_gids_prange::C + 
test_gids::C end function local_views(a::PVectorAllocationTrackOnlyValues) @@ -688,12 +680,12 @@ function local_views(a::PVectorAllocationTrackOnlyValues) end function local_views(a::PVectorAllocationTrackOnlyValues,rows) - @check rows === a.test_dofs_gids_prange + @check rows === a.test_gids a.values end function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - rows = _setup_prange_without_ghosts(a.test_dofs_gids_prange) + rows = _setup_prange_without_ghosts(a.test_gids) _rhs_callback(a,rows) end @@ -707,7 +699,7 @@ function _rhs_callback(row_partitioned_vector_partition,rows) # The ghost values in row_partitioned_vector_partition are # aligned with the FESpace but not with the ghost values in the rows of A b_fespace = PVector(row_partitioned_vector_partition.values, - partition(row_partitioned_vector_partition.test_dofs_gids_prange)) + partition(row_partitioned_vector_partition.test_gids)) # This one is aligned with the rows of A b = similar(b_fespace,eltype(b_fespace),(rows,)) @@ -758,7 +750,7 @@ end struct PVectorAllocationTrackTouchedAndValues{A,B,C} allocations::A values::B - test_dofs_gids_prange::C + test_gids::C end function Algebra.create_from_nz( @@ -787,7 +779,7 @@ Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Grida function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) - @check rows === a.test_dofs_gids_prange + @check rows === a.test_gids a.allocations end @@ -835,7 +827,7 @@ end function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, b::PVectorCounter{<:SubAssembledRows}) A = nz_allocation(a) - dofs = b.test_dofs_gids_prange + dofs = b.test_gids values = map(nz_allocation,b.counters) touched,allocations=_setup_touched_and_allocations_arrays(values) B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) @@ -843,7 +835,7 @@ function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, end function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows}) - dofs = a.test_dofs_gids_prange + dofs = a.test_gids values = map(nz_allocation,a.counters) touched,allocations=_setup_touched_and_allocations_arrays(values) return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) @@ -857,7 +849,7 @@ function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedA # Find the ghost rows allocations = local_views(a.allocations) - indices = partition(a.test_dofs_gids_prange) + indices = partition(a.test_gids) I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices dofs_lids_touched = findall(allocation.touched) loc_to_gho = local_to_ghost(indices) @@ -877,7 +869,7 @@ function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedA ghost_to_global, ghost_to_owner = map( find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays - ngids = length(a.test_dofs_gids_prange) + ngids = length(a.test_gids) _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) end @@ -996,15 +988,23 @@ end # _setup_matrix : local matrices + global PRanges -> Global matrix -function _setup_matrix(values,rows::PRange,cols::PRange) - return PSparseMatrix(values,partition(rows),partition(cols)) -end - -function _setup_matrix(values,rows::Vector{<:PRange},cols::Vector{<:PRange}) - block_ids = CartesianIndices((length(rows),length(cols))) - block_mats = map(block_ids) do I - block_values = map(v -> blocks(v)[I],values) - return _setup_matrix(block_values,rows[I[1]],cols[I[2]]) +function 
_setup_matrix(I,J,V,rows::PRange,cols::PRange) + assembled_rows = map(PArrays.remove_ghost,partition(rows)) + psparse( + I,J,V,assembled_rows,partition(cols); + split_format=true, + assembled=true, + assemble=true, + indices=:local, + reuse=false, + ) |> fetch +end + +function _setup_matrix(I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) + block_ids = CartesianIndices(I) + block_mats = map(block_ids) do id + i = id[1]; j = id[2]; + _setup_matrix(I[i,j],J[i,j],V[i,j],rows[i],cols[j]) end return mortar(block_mats) end @@ -1116,7 +1116,7 @@ function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) end end -# dofs_gids_prange can be either test_dofs_gids_prange or trial_dofs_gids_prange +# dofs_gids_prange can be either test_gids or trial_gids # In the former case, gids is a vector of global test dof identifiers, while in the # latter, a vector of global trial dof identifiers function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl index 41e3f6dc..942a7c28 100644 --- a/src/BlockPartitionedArrays.jl +++ b/src/BlockPartitionedArrays.jl @@ -271,8 +271,8 @@ function PartitionedArrays.partition(a::BlockPArray) return map(mortar,vals) end -function PartitionedArrays.to_trivial_partition(a::BlockPArray) - vals = map(PartitionedArrays.to_trivial_partition,blocks(a)) +function PartitionedArrays.centralize(a::BlockPArray) + vals = map(PartitionedArrays.centralize,blocks(a)) return mortar(vals) end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index c2b09475..844ec9e6 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -118,7 +118,7 @@ end function fetch_vector_ghost_values_cache(vector_partition,partition) cache = PArrays.p_vector_cache(vector_partition,partition) - map(reverse,cache) + reverse(cache) end function fetch_vector_ghost_values!(vector_partition,cache) @@ -163,7 +163,7 @@ function generate_gids( # swapped in the AssemblyCache of partition(cell_range) # Exchange the dof owners - cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) + cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait cell_wise_to_dof_wise!(ldof_to_owner, diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl index 3e810085..5ec6579c 100644 --- a/test/BlockPartitionedArraysTests.jl +++ b/test/BlockPartitionedArraysTests.jl @@ -76,7 +76,7 @@ t = assemble!(__v) assemble!(__m) |> wait fetch(t); -PartitionedArrays.to_trivial_partition(m) +PartitionedArrays.centralize(m) partition(v) partition(m) diff --git a/test/parrays_update.jl b/test/parrays_update.jl new file mode 100644 index 00000000..951a68d7 --- /dev/null +++ b/test/parrays_update.jl @@ -0,0 +1,36 @@ + +using Gridap +using PartitionedArrays +using GridapDistributed + +np = (1,2) +ranks = with_debug() do distribute + distribute(LinearIndices((prod(np),))) +end + +nc = (2,4) +serial_model = CartesianDiscreteModel((0,1,0,1),nc) +dist_model = CartesianDiscreteModel(ranks,np,(0,1,0,1),nc) + +cids = get_cell_gids(dist_model) + +reffe = ReferenceFE(lagrangian,Float64,1) +serial_V = TestFESpace(serial_model,reffe) +dist_V = TestFESpace(dist_model,reffe) + +serial_ids = get_free_dof_ids(serial_V) +dist_ids = get_free_dof_ids(dist_V) + +serial_Ω = Triangulation(serial_model) +serial_dΩ = Measure(serial_Ω,2) + +dist_Ω = Triangulation(dist_model) +dist_dΩ = Measure(dist_Ω,2) + +serial_a1(u,v) = 
∫(u⋅v)*serial_dΩ +serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V) + +dist_a1(u,v) = ∫(u⋅v)*dist_dΩ +dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) +all(centralize(dist_A1) - serial_A1 .< 1e-10) + From e17e85ce107030fcfdc3fafd8ae386a46176eb8f Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 2 Sep 2024 10:37:17 +1000 Subject: [PATCH 03/14] FESpace gids are now permuted LocalIndicesWithVariableBlockSize --- src/DistributedUtils.jl | 44 +++++++ src/FESpaces.jl | 240 ++++++++++++++++++--------------------- src/GridapDistributed.jl | 6 +- 3 files changed, 156 insertions(+), 134 deletions(-) create mode 100644 src/DistributedUtils.jl diff --git a/src/DistributedUtils.jl b/src/DistributedUtils.jl new file mode 100644 index 00000000..0b34a0a1 --- /dev/null +++ b/src/DistributedUtils.jl @@ -0,0 +1,44 @@ + +function permuted_variable_partition( + n_own::AbstractArray{<:Integer}, + gids::AbstractArray{<:AbstractArray{<:Integer}}, + owners::AbstractArray{<:AbstractArray{<:Integer}}; + n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), + start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +) + ranks = linear_indices(n_own) + np = length(ranks) + map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners + n_local = length(gids) + n_ghost = n_local - n_own + perm = fill(zero(Int32),n_local) + ghost_gids = fill(zero(Int),n_ghost) + ghost_owners = fill(zero(Int32),n_ghost) + + n_ghost = 0 + for (lid,(gid,owner)) in enumerate(zip(gids,owners)) + if owner == rank + perm[lid] = gid-start+1 + else + n_ghost += 1 + ghost_gids[n_ghost] = gid + ghost_owners[n_ghost] = owner + perm[lid] = n_own + n_ghost + end + end + @assert n_ghost == n_local - n_own + + ghost = GhostIndices(n_global,ghost_gids,ghost_owners) + dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( + CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost + ) + permute_indices(dof_ids,perm) + end +end + +function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T + ptrs = Vector{Int32}(undef,length(vv)+1) + Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv) + Arrays.length_to_ptrs!(ptrs) + return ptrs +end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 844ec9e6..02530cfa 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -56,36 +56,25 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va return free_values, dirichlet_values end -function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange) - map(cell_wise_vector, - dof_wise_vector, - cell_to_ldofs, - partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices +function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) + map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = cwv.ptrs - data = cwv.data - cell_own_to_local = own_to_local(indices) - for cell in cell_own_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) - p = ptrs[cell]-1 + p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) if ldof > 0 - data[i+p] = dwv[ldof] + cwv.data[i+p] = dwv[ldof] end end end end end -function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range) - map(dof_wise_vector, - cell_wise_vector, - cell_to_ldofs, - partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices +function 
cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) + map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) - for cell in cell_ghost_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) @@ -97,23 +86,14 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c end end -function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange) - cwv=map(cell_to_ldofs) do cell_to_ldofs - cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = Vector{Int32}(undef,ncells+1) - for cell in 1:ncells - ldofs = getindex!(cache,cell_to_ldofs,cell) - ptrs[cell+1] = length(ldofs) - end - PArrays.length_to_ptrs!(ptrs) - ndata = ptrs[end]-1 - data = Vector{Int}(undef,ndata) - data .= -1 - JaggedArray(data,ptrs) - end - dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange) - cwv +function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_ids) + cwv = map(cell_to_ldofs) do cell_to_ldofs + ptrs = generate_ptrs(cell_to_ldofs) + data = fill(-1,ptrs[end]-1) + JaggedArray(data,ptrs) + end + dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids) + return cwv end function fetch_vector_ghost_values_cache(vector_partition,partition) @@ -128,13 +108,15 @@ end function generate_gids( cell_range::PRange, cell_to_ldofs::AbstractArray{<:AbstractArray}, - nldofs::AbstractArray{<:Integer}) + nldofs::AbstractArray{<:Integer} +) + ranks = linear_indices(partition(cell_range)) # Find and count number owned dofs ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs ldof_to_owner = fill(Int32(0),nldofs) - cache = array_cache(cell_to_ldofs) lcell_to_owner = local_to_owner(indices) + cache = array_cache(cell_to_ldofs) for cell in 1:length(cell_to_ldofs) owner = lcell_to_owner[cell] ldofs = getindex!(cache,cell_to_ldofs,cell) @@ -151,102 +133,54 @@ function generate_gids( ldof_to_owner, nodofs end |> tuple_of_arrays - cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner, - cell_to_ldofs, - cell_range) - - # Note1 : this call potentially updates cell_prange with the - # info required to exchange info among nearest neighbours - # so that it can be re-used in the future for other exchanges - - # Note2 : we need to call reverse() as the senders and receivers are - # swapped in the AssemblyCache of partition(cell_range) - - # Exchange the dof owners - cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) - fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_owner, - cell_ldofs_to_part, - cell_to_ldofs, - cell_range) - # Find the global range of owned dofs first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs))) + + # Exchange the dof owners. Cell owner always has correct dof owner. 
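+  # Pack/exchange/unpack pattern: the dof-wise data is packed into cell-wise
+  # JaggedArrays on owned cells, consistent! copies owner values into ghost
+  # cells, and the received data is scattered back to the dofs of ghost cells.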
+ cell_ldofs_to_owner = dof_wise_to_cell_wise(ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_ldofs_to_owner,partition(cell_range))) |> wait + cell_wise_to_dof_wise!(ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)) - # Distribute gdofs to owned ones - ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices - me = part_id(indices) + # Fill owned gids + ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner + ldof_to_gdof = fill(0,length(ldof_to_owner)) offset = first_gdof-1 - ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner)) odof = 0 for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me + if owner == rank odof += 1 - ldof_to_gdof[ldof] = odof - else - ldof_to_gdof[ldof] = 0 - end - end - for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me - ldof_to_gdof[ldof] += offset + ldof_to_gdof[ldof] = odof + offset end end ldof_to_gdof end - # Create cell-wise global dofs - cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof, - cell_to_ldofs, - cell_range) + # Exchange gids + cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_to_gdofs,partition(cell_range))) |> wait - # Exchange the global dofs - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait - - # Distribute global dof ids also to ghost + # Fill ghost gids with exchanged information map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices - gdof = 0 cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) cell_local_to_owner = local_to_owner(indices) - for cell in cell_ghost_to_local + for cell in ghost_to_local(indices) ldofs = getindex!(cache,cell_to_ldofs,cell) + cell_owner = cell_local_to_owner[cell] p = cell_to_gdofs.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) - if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell] + dof_owner = ldof_to_owner[ldof] + if ldof > 0 && (dof_owner == cell_owner) ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p] end end end end - dof_wise_to_cell_wise!(cell_to_gdofs, - ldof_to_gdof, - cell_to_ldofs, - cell_range) - - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_gdof, - cell_to_gdofs, - cell_to_ldofs, - cell_range) - - # Setup DoFs LocalIndices - ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs))) - local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs, - indices, - ldof_to_gdof, - ldof_to_owner - me = part_id(indices) - LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner) - end - - # Setup dof range - dofs_range = PRange(local_indices) - - return dofs_range + # Setup DoFs AbstractLocalIndices + dof_ids = permuted_variable_partition( + nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof + ) + return PRange(dof_ids) end # FEFunction related @@ -576,12 +510,14 @@ end function FESpaces.collect_cell_matrix( trial::DistributedFESpace, test::DistributedFESpace, - a::DistributedDomainContribution) + a::DistributedDomainContribution +) map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) end function FESpaces.collect_cell_vector( - test::DistributedFESpace, a::DistributedDomainContribution) + test::DistributedFESpace, a::DistributedDomainContribution +) map(collect_cell_vector,local_views(test),local_views(a)) end @@ -589,12 +525,14 @@ function 
FESpaces.collect_cell_matrix_and_vector( trial::DistributedFESpace, test::DistributedFESpace, biform::DistributedDomainContribution, - liform::DistributedDomainContribution) + liform::DistributedDomainContribution +) map(collect_cell_matrix_and_vector, local_views(trial), local_views(test), local_views(biform), - local_views(liform)) + local_views(liform) + ) end function FESpaces.collect_cell_matrix_and_vector( @@ -602,17 +540,20 @@ function FESpaces.collect_cell_matrix_and_vector( test::DistributedFESpace, biform::DistributedDomainContribution, liform::DistributedDomainContribution, - uhd) + uhd +) map(collect_cell_matrix_and_vector, local_views(trial), local_views(test), local_views(biform), local_views(liform), - local_views(uhd)) + local_views(uhd) + ) end function FESpaces.collect_cell_vector( - test::DistributedFESpace,l::Number) + test::DistributedFESpace,l::Number +) map(local_views(test)) do s collect_cell_vector(s,l) end @@ -622,7 +563,8 @@ function FESpaces.collect_cell_matrix_and_vector( trial::DistributedFESpace, test::DistributedFESpace, mat::DistributedDomainContribution, - l::Number) + l::Number +) map(local_views(trial),local_views(test),local_views(mat)) do u,v,m collect_cell_matrix_and_vector(u,v,m,l) end @@ -633,26 +575,29 @@ function FESpaces.collect_cell_matrix_and_vector( test::DistributedFESpace, mat::DistributedDomainContribution, l::Number, - uhd) + uhd +) map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f collect_cell_matrix_and_vector(u,v,m,l,f) end end + """ """ -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler +struct DistributedSparseMatrixAssembler{A,B,C,D,E,F,G} <: SparseMatrixAssembler strategy::A assems::B matrix_builder::C vector_builder::D - test_dofs_gids_prange::E - trial_dofs_gids_prange::F + test_gids::E + trial_gids::F + caches::G end local_views(a::DistributedSparseMatrixAssembler) = a.assems -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_dofs_gids_prange -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_dofs_gids_prange +FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids +FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy @@ -694,20 +639,43 @@ function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrix map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) end +function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrixAssembler,matdata) + @check !isnothing(a.caches) "Assembler is not reusable!" + @check haskey(a.caches,objectid(mat)) "Cache not found!" 
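+  # The cache is keyed by objectid(mat), so this in-place reassembly must receive
+  # the exact matrix instance for which a cache was registered (presumably by the
+  # first assembly); the cache is finally consumed by psparse! in create_from_nz!.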
+ cache = a.caches[objectid(mat)] + + LinearAlgebra.fillstored!(mat,zero(eltype(mat))) + + counter = nz_counter(get_matrix_builder(a),(get_rows(a),get_cols(a))) + map(local_views(counter),partition(mat)) do counter, mat + counter.nnz = nnz(mat) + end + allocs = nz_allocation(counter) + numeric_loop_matrix!(allocs,a,matdata) + + create_from_nz!(mat,allocs,cache) +end + +function create_from_nz!(A,a::DistributedAllocationCOO{<:Assembled},cache) + _,_,V = get_allocations(a) + psparse!(A,V,cache) |> wait + return A +end + # Parallel Assembly strategies -function local_assembly_strategy(::SubAssembledRows,rows,cols) +function local_assembly_strategy(::Assembled,rows,cols) DefaultAssemblyStrategy() end # When using this one, make sure that you also loop over ghost cells. # This is at your own risk. -function local_assembly_strategy(::FullyAssembledRows,rows,cols) +function local_assembly_strategy(::LocallyAssembled,rows,cols) rows_local_to_ghost = local_to_ghost(rows) GenericAssemblyStrategy( identity, identity, - row->rows_local_to_ghost[row]==0, + row->iszero(rows_local_to_ghost[row]), col->true ) end @@ -718,7 +686,8 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, rows::PRange, cols::PRange, - par_strategy=SubAssembledRows() + par_strategy=Assembled(); + reuse_caches=false ) assems = map(partition(rows),partition(cols)) do rows,cols local_strategy = local_assembly_strategy(par_strategy,rows,cols) @@ -732,7 +701,12 @@ function FESpaces.SparseMatrixAssembler( end mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy) vec_builder = PVectorBuilder(local_vec_type,par_strategy) - return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols) + if reuse_caches + caches = Dict{UInt64,Any}() + else + caches = nothing + end + return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols,caches) end function FESpaces.SparseMatrixAssembler( @@ -740,22 +714,24 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, trial::DistributedFESpace, test::DistributedFESpace, - par_strategy=SubAssembledRows() + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... ) rows = get_free_dof_ids(test) cols = get_free_dof_ids(trial) - SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy) + SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) end function FESpaces.SparseMatrixAssembler( trial::DistributedFESpace, test::DistributedFESpace, - par_strategy=SubAssembledRows() + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... ) Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial))) T = eltype(Tv) Tm = SparseMatrixCSC{T,Int} - SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy) + SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
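+  # Usage sketch (hypothetical; assumes the first assembly registers its cache):
+  #   assem = SparseMatrixAssembler(trial,test,Assembled();reuse_caches=true)
+  #   A = assemble_matrix(assem,matdata)
+  #   assemble_matrix!(A,assem,matdata) # reuses the cached psparse! structure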
end # ZeroMean FESpace diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl index 8c20a975..7edf2a48 100644 --- a/src/GridapDistributed.jl +++ b/src/GridapDistributed.jl @@ -31,8 +31,8 @@ import Base: inv, abs, abs2, *, +, -, /, adjoint, transpose, real, imag, conj, g import Gridap.Fields: grad2curl import Gridap.CellData: Interpolable -export FullyAssembledRows -export SubAssembledRows +export LocallyAssembled +export Assembled export get_cell_gids export get_face_gids @@ -42,6 +42,8 @@ export with_ghost, no_ghost export redistribute +include("DistributedUtils.jl") + include("BlockPartitionedArrays.jl") include("Algebra.jl") From aab58d445844909236f3b3f1852261f334295fbf Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Sat, 7 Sep 2024 00:18:42 +1000 Subject: [PATCH 04/14] Started re-structuring assembly --- src/Algebra.jl | 1143 -------------------- src/Assembly.jl | 199 ++++ src/DistributedUtils.jl | 44 - src/Geometry.jl | 4 +- src/GridapDistributed.jl | 5 +- src/GridapExtras.jl | 55 + src/MultiField.jl | 2 +- src/PArraysExtras.jl | 210 ++++ src/_Algebra_old.jl | 1238 ++++++++++++++++++++++ test/BlockSparseMatrixAssemblersTests.jl | 4 +- test/DivConformingTests.jl | 2 +- test/FESpacesTests.jl | 4 +- test/HeatEquationTests.jl | 2 +- test/PLaplacianTests.jl | 4 +- test/parrays_update.jl | 51 + 15 files changed, 1768 insertions(+), 1199 deletions(-) create mode 100644 src/Assembly.jl delete mode 100644 src/DistributedUtils.jl create mode 100644 src/GridapExtras.jl create mode 100644 src/PArraysExtras.jl create mode 100644 src/_Algebra_old.jl diff --git a/src/Algebra.jl b/src/Algebra.jl index 11af9149..3fa0b502 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -29,192 +29,6 @@ function Algebra.allocate_in_domain(matrix::BlockPMatrix) allocate_in_domain(BlockPVector{V},matrix) end -# PSparseMatrix copy - -function Base.copy(a::PSparseMatrix) - mats = map(copy,partition(a)) - cache = map(PartitionedArrays.copy_cache,a.cache) - return PSparseMatrix(mats,partition(axes(a,1)),partition(axes(a,2)),cache) -end - -# PartitionedArrays extras - -function LinearAlgebra.axpy!(α,x::PVector,y::PVector) - @check partition(axes(x,1)) === partition(axes(y,1)) - map(partition(x),partition(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - consistent!(y) |> wait - return y -end - -function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) - map(blocks(x),blocks(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - return y -end - -function Algebra.axpy_entries!( - α::Number, A::PSparseMatrix, B::PSparseMatrix; - check::Bool=true -) -# We should definitely check here that the index partitions are the same. -# However: Because the different matrices are assembled separately, the objects are not the -# same (i.e can't use ===). Checking the index partitions would then be costly... - @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) - @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) - map(partition(A),partition(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -function Algebra.axpy_entries!( - α::Number, A::BlockPMatrix, B::BlockPMatrix; - check::Bool=true -) - map(blocks(A),blocks(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.ArrayCounter,axes) - @notimplemented -end - -# This might go to Gridap in the future. We keep it here for the moment. 
-function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} - b = Algebra.CounterCOO{T}(axes) - b.nnz = a.nnz - b -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} - counter = change_axes(a.counter,axes) - Algebra.AllocationCOO(counter,a.I,a.J,a.V) -end - -# Array of PArrays -> PArray of Arrays -function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - PartitionedArrays.getany(aj) - end - end -end - -function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - aj.items[i] - end - end -end - -# This type is required because MPIArray from PArrays -# cannot be instantiated with a NULL communicator -struct MPIVoidVector{T} <: AbstractVector{T} - comm::MPI.Comm - function MPIVoidVector(::Type{T}) where {T} - new{T}(MPI.COMM_NULL) - end -end - -Base.size(a::MPIVoidVector) = (0,) -Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() -function Base.getindex(a::MPIVoidVector,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.setindex!(a::MPIVoidVector,v,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) - println(io,"MPIVoidVector") -end - -function num_parts(comm::MPI.Comm) - if comm != MPI.COMM_NULL - nparts = MPI.Comm_size(comm) - else - nparts = -1 - end - nparts -end -@inline num_parts(comm::MPIArray) = num_parts(comm.comm) -@inline num_parts(comm::DebugArray) = length(comm.items) -@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) - -function get_part_id(comm::MPI.Comm) - if comm != MPI.COMM_NULL - id = MPI.Comm_rank(comm)+1 - else - id = -1 - end - id -end -@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) -@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) - -""" - i_am_in(comm::MPIArray) - i_am_in(comm::DebugArray) - - Returns `true` if the processor is part of the subcommunicator `comm`. 
-""" -function i_am_in(comm::MPI.Comm) - get_part_id(comm) >=0 -end -@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) -@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) -@inline i_am_in(comm::DebugArray) = true - -function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) - x_new = map(new_parts) do p - if isa(x,MPIArray) - PartitionedArrays.getany(x) - elseif isa(x,DebugArray) && (p <= length(x.items)) - x.items[p] - else - default - end - end - return x_new -end - -function generate_subparts(parts::MPIArray,new_comm_size) - root_comm = parts.comm - root_size = MPI.Comm_size(root_comm) - rank = MPI.Comm_rank(root_comm) - - @static if isdefined(MPI,:MPI_UNDEFINED) - mpi_undefined = MPI.MPI_UNDEFINED[] - else - mpi_undefined = MPI.API.MPI_UNDEFINED[] - end - - if root_size == new_comm_size - return parts - else - if rank < new_comm_size - comm = MPI.Comm_split(root_comm,0,0) - return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) - else - comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) - return MPIVoidVector(eltype(parts)) - end - end -end - -function generate_subparts(parts::DebugArray,new_comm_size) - DebugArray(LinearIndices((new_comm_size,))) -end - # local_views function local_views(a) @@ -284,960 +98,3 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_ end return BlockPVector(vals,ids) end - -# This function computes a mapping among the local identifiers of a and b -# for which the corresponding global identifiers are both in a and b. -# Note that the haskey check is necessary because in the general case -# there might be gids in b which are not present in a -function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) - a_local_to_b_local = fill(Int32(-1),local_length(a)) - b_local_to_global = local_to_global(b) - a_global_to_local = global_to_local(a) - for blid in 1:local_length(b) - gid = b_local_to_global[blid] - if a_global_to_local[gid] != zero(eltype(a_global_to_local)) - alid = a_global_to_local[gid] - a_local_to_b_local[alid] = blid - end - end - a_local_to_b_local -end - -# This type is required in order to be able to access the local portion -# of distributed sparse matrices and vectors using local indices from the -# distributed test and trial spaces -struct LocalView{T,N,A,B} <:AbstractArray{T,N} - plids_to_value::A - d_to_lid_to_plid::B - local_size::NTuple{N,Int} - function LocalView( - plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} - A = typeof(plids_to_value) - B = typeof(d_to_lid_to_plid) - local_size = map(length,d_to_lid_to_plid) - new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) - end -end - -Base.size(a::LocalView) = a.local_size -Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() -function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) - if all(i->i>0,plids) - a.plids_to_value[plids...] - else - zero(T) - end -end -function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) - @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" - a.plids_to_value[plids...] 
= v -end - -function _lid_to_plid(lid,lid_to_plid) - plid = lid_to_plid[lid] - plid -end - -function local_views(a::PVector,new_rows::PRange) - old_rows = axes(a,1) - if partition(old_rows) === partition(new_rows) - partition(a) - else - map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows - LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) - end - end -end - -function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) - old_rows, old_cols = axes(a) - if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) - partition(a) - else - map(partition(a), - partition(old_rows),partition(old_cols), - partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols - rl2lmap = find_local_to_local_map(new_rows,old_rows) - cl2lmap = find_local_to_local_map(new_cols,old_cols) - LocalView(matrix_partition,(rl2lmap,cl2lmap)) - end - end -end - -function local_views(a::BlockPVector,new_rows::BlockPRange) - vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays - return map(mortar,vals) -end - -function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) - vals = map(CartesianIndices(blocksize(a))) do I - local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) - end |> to_parray_of_arrays - return map(mortar,vals) -end - -# PSparseMatrix assembly - -struct FullyAssembledRows end -struct SubAssembledRows end - -# For the moment we use COO format even though -# it is quite memory consuming. -# We need to implement communication in other formats in -# PartitionedArrays.jl - -struct PSparseMatrixBuilderCOO{T,B} - local_matrix_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter( - builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange} -) where A - rows, cols = axs # test ids, trial ids - counters = map(partition(rows),partition(cols)) do r,c - axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) - Algebra.CounterCOO{A}(axs) - end - DistributedCounterCOO(builder.par_strategy,counters,rows,cols) -end - -function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv - T = eltype(Tv) - return PSparseMatrix{T} -end - - -""" -""" -struct DistributedCounterCOO{A,B,C,D} <: GridapType - par_strategy::A - counters::B - test_gids::C - trial_gids::D - function DistributedCounterCOO( - par_strategy, - counters::AbstractArray{<:Algebra.CounterCOO}, - test_gids::PRange, - trial_gids::PRange) - A = typeof(par_strategy) - B = typeof(counters) - C = typeof(test_gids) - D = typeof(trial_gids) - new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) - end -end - -function local_views(a::DistributedCounterCOO) - return a.counters -end - -function local_views(a::DistributedCounterCOO,test_gids,trial_gids) - @check PArrays.matching_local_indices(test_gids,a.test_gids) - @check PArrays.matching_local_indices(trial_gids,a.trial_gids) - return a.counters -end - -function Algebra.nz_allocation(a::DistributedCounterCOO) - allocs = map(nz_allocation,a.counters) - DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) -end - -struct DistributedAllocationCOO{A,B,C,D} <: GridapType - par_strategy::A - allocs::B - test_gids::C - trial_gids::D - function DistributedAllocationCOO( - par_strategy, - allocs::AbstractArray{<:Algebra.AllocationCOO}, - test_gids::PRange, - trial_gids::PRange) - A = typeof(par_strategy) - B = typeof(allocs) - C = typeof(test_gids) - D = 
typeof(trial_gids) - new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) - end -end - -function change_axes( - a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, - axes::Tuple{<:PRange,<:PRange} -) where {A,B} - local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols - (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) - end - allocs = map(change_axes,a.allocs,local_axes) - DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) -end - -function change_axes( - a::MatrixBlock{<:DistributedAllocationCOO}, - axes::Tuple{<:Vector,<:Vector} -) - block_ids = CartesianIndices(a.array) - rows, cols = axes - array = map(block_ids) do I - change_axes(a[I],(rows[I[1]],cols[I[2]])) - end - return ArrayBlock(array,a.touched) -end - -function local_views(a::DistributedAllocationCOO) - a.allocs -end - -function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) - @check test_gids === a.test_gids - @check trial_gids === a.trial_gids - a.allocs -end - -function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) - array = map(local_views,a.array) |> to_parray_of_arrays - return map(ai -> ArrayBlock(ai,a.touched),array) -end - -function get_allocations(a::DistributedAllocationCOO) - I,J,V = map(local_views(a)) do alloc - alloc.I, alloc.J, alloc.V - end |> tuple_of_arrays - return I,J,V -end - -function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) - tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays - return tuple_of_array_of_parrays -end - -get_test_gids(a::DistributedAllocationCOO) = a.test_gids -get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids -get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) -get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array)) - -function Algebra.create_from_nz(a::PSparseMatrix) - # For FullyAssembledRows the underlying Exchanger should - # not have ghost layer making assemble! 
do nothing (TODO check) - assemble!(a) |> wait - a -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:FullyAssembledRows}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:FullyAssembledRows}}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function _fa_create_from_nz_with_callback(callback,a) - - # Recover some data - I,J,V = get_allocations(a) - test_gids = get_test_gids(a) - trial_gids = get_trial_gids(a) - - rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) - b = callback(rows) - - # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) - - # Create the range for cols - cols = _setup_prange(trial_gids,J;ax=:cols) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - A = _setup_matrix(I,J,V,rows,cols) - return A, b -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:SubAssembledRows}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:SubAssembledRows}}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function _sa_create_from_nz_with_callback(callback,async_callback,a,b) - # Recover some data - I,J,V = get_allocations(a) - test_gids = get_test_gids(a) - trial_gids = get_trial_gids(a) - - # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) - - # Create the Prange for the rows - rows = _setup_prange(test_gids,I;ax=:rows) - - # Move (I,J,V) triplets to the owner process of each row I. 
- # The triplets are accompanyed which Jo which is the process column owner - Jo = get_gid_owners(J,trial_gids;ax=:cols) - t = _assemble_coo!(I,J,V,rows;owners=Jo) - - # Here we can overlap computations - # This is a good place to overlap since - # sending the matrix rows is a lot of data - if !isa(b,Nothing) - bprange=_setup_prange_from_pvector_allocation(b) - b = callback(bprange) - end - - # Wait the transfer to finish - wait(t) - - # Create the Prange for the cols - cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) - - # Overlap rhs communications with CSC compression - t2 = async_callback(b) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - A = _setup_matrix(I,J,V,rows,cols) - - # Wait the transfer to finish - if !isa(t2,Nothing) - wait(t2) - end - - return A, b -end - - -# PVector assembly - -struct PVectorBuilder{T,B} - local_vector_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange}) - T = builder.local_vector_type - rows, = axs - counters = map(partition(rows)) do rows - axs = (Base.OneTo(local_length(rows)),) - nz_counter(ArrayBuilder(T),axs) - end - PVectorCounter(builder.par_strategy,counters,rows) -end - -function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv - T = eltype(Tv) - return PVector{T} -end - -struct PVectorCounter{A,B,C} - par_strategy::A - counters::B - test_gids::C -end - -Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() - -function local_views(a::PVectorCounter) - a.counters -end - -function local_views(a::PVectorCounter,rows) - @check rows === a.test_gids - a.counters -end - -function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows}) - dofs = a.test_gids - values = map(nz_allocation,a.counters) - PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) -end - -struct PVectorAllocationTrackOnlyValues{A,B,C} - par_strategy::A - values::B - test_gids::C -end - -function local_views(a::PVectorAllocationTrackOnlyValues) - a.values -end - -function local_views(a::PVectorAllocationTrackOnlyValues,rows) - @check rows === a.test_gids - a.values -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - rows = _setup_prange_without_ghosts(a.test_gids) - _rhs_callback(a,rows) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:SubAssembledRows}) - # This point MUST NEVER be reached. 
If reached there is an inconsistency - # in the parallel code in charge of vector assembly - @assert false -end - -function _rhs_callback(row_partitioned_vector_partition,rows) - # The ghost values in row_partitioned_vector_partition are - # aligned with the FESpace but not with the ghost values in the rows of A - b_fespace = PVector(row_partitioned_vector_partition.values, - partition(row_partitioned_vector_partition.test_gids)) - - # This one is aligned with the rows of A - b = similar(b_fespace,eltype(b_fespace),(rows,)) - - # First transfer owned values - b .= b_fespace - - # Now transfer ghost - function transfer_ghost(b,b_fespace,ids,ids_fespace) - num_ghosts_vec = ghost_length(ids) - gho_to_loc_vec = ghost_to_local(ids) - loc_to_glo_vec = local_to_global(ids) - gid_to_lid_fe = global_to_local(ids_fespace) - for ghost_lid_vec in 1:num_ghosts_vec - lid_vec = gho_to_loc_vec[ghost_lid_vec] - gid = loc_to_glo_vec[lid_vec] - lid_fespace = gid_to_lid_fe[gid] - b[lid_vec] = b_fespace[lid_fespace] - end - end - map( - transfer_ghost, - partition(b), - partition(b_fespace), - b.index_partition, - b_fespace.index_partition) - - return b -end - -function Algebra.create_from_nz(a::PVector) - assemble!(a) |> wait - return a -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:FullyAssembledRows}, - c_fespace::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - - function callback(rows) - _rhs_callback(c_fespace,rows) - end - - A,b = _fa_create_from_nz_with_callback(callback,a) - return A,b -end - -struct PVectorAllocationTrackTouchedAndValues{A,B,C} - allocations::A - values::B - test_gids::C -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:SubAssembledRows}, - b::PVectorAllocationTrackTouchedAndValues) - - function callback(rows) - _rhs_callback(b,rows) - end - - function async_callback(b) - # now we can assemble contributions - assemble!(b) - end - - A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) - return A,b -end - -struct ArrayAllocationTrackTouchedAndValues{A} - touched::Vector{Bool} - values::A -end - -Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() - - -function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) - @check rows === a.test_gids - a.allocations -end - -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if i>0 - if !(a.touched[i]) - a.touched[i]=true - end - if !isa(v,Nothing) - a.values[i]=c(v,a.values[i]) - end - end - nothing -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if !isa(v,Nothing) - for (ve,ie) in zip(v,i) - Arrays.add_entry!(c,a,ve,ie) - end - else - for ie in i - Arrays.add_entry!(c,a,nothing,ie) - end - end - nothing -end - - -function _setup_touched_and_allocations_arrays(values) - touched = map(values) do values - fill!(Vector{Bool}(undef,length(values)),false) - end - allocations = map(values,touched) do values,touched - ArrayAllocationTrackTouchedAndValues(touched,values) - end - touched, allocations -end - -function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, - b::PVectorCounter{<:SubAssembledRows}) - A = nz_allocation(a) - dofs = b.test_gids - values = 
map(nz_allocation,b.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) - return A,B -end - -function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows}) - dofs = a.test_gids - values = map(nz_allocation,a.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) -end - -function local_views(a::PVectorAllocationTrackTouchedAndValues) - a.allocations -end - -function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) - - # Find the ghost rows - allocations = local_views(a.allocations) - indices = partition(a.test_gids) - I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices - dofs_lids_touched = findall(allocation.touched) - loc_to_gho = local_to_ghost(indices) - n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) - I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) - cur = 1 - for lid in dofs_lids_touched - dof_lid = loc_to_gho[lid] - if dof_lid != 0 - I_ghost_lids[cur] = dof_lid - cur = cur+1 - end - end - I_ghost_lids - end - - ghost_to_global, ghost_to_owner = map( - find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays - - ngids = length(a.test_gids) - _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) - rows = _setup_prange_from_pvector_allocation(a) - b = _rhs_callback(a,rows) - t2 = assemble!(b) - # Wait the transfer to finish - if t2 !== nothing - wait(t2) - end - return b -end - -# Common Assembly Utilities -function first_gdof_from_ids(ids) - if own_length(ids) == 0 - return 1 - end - lid_to_gid = local_to_global(ids) - owned_to_lid = own_to_local(ids) - return Int(lid_to_gid[first(owned_to_lid)]) -end - -function find_gid_and_owner(ighost_to_jghost,jindices) - jghost_to_local = ghost_to_local(jindices) - jlocal_to_global = local_to_global(jindices) - jlocal_to_owner = local_to_owner(jindices) - ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) - - ighost_to_global = jlocal_to_global[ighost_to_jlocal] - ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] - return ighost_to_global, ighost_to_owner -end - -# The given ids are assumed to be a sub-set of the lids -function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) - glo_to_loc = global_to_local(a) - loc_to_gho = local_to_ghost(a) - - # First pass: Allocate - i = 0 - ghost_lids_touched = fill(false,ghost_length(a)) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - i += 1 - end - end - gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) - - # Second pass: fill - i = 1 - fill!(ghost_lids_touched,false) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - gids_ghost_lid_to_ghost_lid[i] = ghost_lid - i += 1 - end - end - - return gids_ghost_lid_to_ghost_lid -end - -# Find the neighbours of partition1 trying -# to use those in partition2 as a hint -function _find_neighbours!(partition1, partition2) - partition2_snd, partition2_rcv = assembly_neighbors(partition2) - partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv) - return assembly_neighbors(partition1; 
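# reusing partition2's ExchangeGraph as a hint avoids a second neighbor-discovery pass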
neighbors=partition2_graph) -end - -# to_global! & to_local! analogs, needed for dispatching in block assembly - -function to_local_indices!(I,ids::PRange;kwargs...) - map(to_local!,I,partition(ids)) -end - -function to_global_indices!(I,ids::PRange;kwargs...) - map(to_global!,I,partition(ids)) -end - -function get_gid_owners(I,ids::PRange;kwargs...) - map(I,partition(ids)) do I, indices - glo_to_loc = global_to_local(indices) - loc_to_own = local_to_owner(indices) - map(x->loc_to_own[glo_to_loc[x]], I) - end -end - -for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] - @eval begin - function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) - map($f,I,ids) - end - - function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) - @check ax ∈ [:rows,:cols] - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if ax == :rows - $f(I[i,j],ids[i]) - else - $f(I[i,j],ids[j]) - end - end - end - end -end - -# _setup_matrix : local matrices + global PRanges -> Global matrix - -function _setup_matrix(I,J,V,rows::PRange,cols::PRange) - assembled_rows = map(PArrays.remove_ghost,partition(rows)) - psparse( - I,J,V,assembled_rows,partition(cols); - split_format=true, - assembled=true, - assemble=true, - indices=:local, - reuse=false, - ) |> fetch -end - -function _setup_matrix(I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) - block_ids = CartesianIndices(I) - block_mats = map(block_ids) do id - i = id[1]; j = id[2]; - _setup_matrix(I[i,j],J[i,j],V[i,j],rows[i],cols[j]) - end - return mortar(block_mats) -end - -# _assemble_coo! : local coo triplets + global PRange -> Global coo values - -function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) - if isa(owners,Nothing) - PArrays.assemble_coo!(I,J,V,partition(rows)) - else - assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) - end -end - -function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if isa(owners,Nothing) - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) - else - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) - end - end -end - -function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) - """ - Returns three JaggedArrays with the coo triplets - to be sent to the corresponding owner parts in parts_snd - """ - function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) - global_to_local_row = global_to_local(row_lids) - local_row_to_owner = local_to_owner(row_lids) - owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) - ptrs = zeros(Int32,length(parts_snd)+1) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - ptrs[owner_to_i[owner]+1] +=1 - end - end - PArrays.length_to_ptrs!(ptrs) - gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) - gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) - jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) - v_snd_data = zeros(eltype(k_v),ptrs[end]-1) - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - gj = k_gj[k] - v = k_v[k] - p = ptrs[owner_to_i[owner]] - gi_snd_data[p] = gi - gj_snd_data[p] = gj - jo_snd_data[p] = k_jo[k] - v_snd_data[p] = v - k_v[k] = zero(v) - ptrs[owner_to_i[owner]] += 1 - end - end - PArrays.rewind_ptrs!(ptrs) - gi_snd = JaggedArray(gi_snd_data,ptrs) - gj_snd = JaggedArray(gj_snd_data,ptrs) 
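# setup_snd above uses the classic CSR-style pointer idiom: count entries per
# destination, turn counts into start offsets, fill flat buffers while
# advancing the offsets, then rewind (PArrays.length_to_ptrs! and
# PArrays.rewind_ptrs! do this bookkeeping in the real code). The same idiom
# with plain vectors:
function bucket_by_owner(owner::Vector{Int},data::Vector{Float64},nowners::Int)
  ptrs = zeros(Int32,nowners+1)
  for o in owner
    ptrs[o+1] += 1           # count entries per owner
  end
  ptrs[1] = 1
  for i in 1:nowners
    ptrs[i+1] += ptrs[i]     # counts -> start offsets (length_to_ptrs!)
  end
  flat = zeros(ptrs[end]-1)
  for (o,v) in zip(owner,data)
    flat[ptrs[o]] = v        # place entry, then advance this owner's cursor
    ptrs[o] += 1
  end
  for i in nowners:-1:1
    ptrs[i+1] = ptrs[i]      # rewind cursors back to start offsets (rewind_ptrs!)
  end
  ptrs[1] = 1
  return flat,ptrs
end
flat,ptrs = bucket_by_owner([2,1,2,1],[10.0,20.0,30.0,40.0],2)
@assert flat == [20.0,40.0,10.0,30.0] && ptrs == Int32[1,3,5]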
- jo_snd = JaggedArray(jo_snd_data,ptrs) - v_snd = JaggedArray(v_snd_data,ptrs) - gi_snd, gj_snd, jo_snd, v_snd - end - """ - Pushes to coo_entries_with_column_owner the tuples - gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes - """ - function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - current_n = length(k_gi) - new_n = current_n + length(gi_rcv.data) - resize!(k_gi,new_n) - resize!(k_gj,new_n) - resize!(k_jo,new_n) - resize!(k_v,new_n) - for p in 1:length(gi_rcv.data) - k_gi[current_n+p] = gi_rcv.data[p] - k_gj[current_n+p] = gj_rcv.data[p] - k_jo[current_n+p] = jo_rcv.data[p] - k_v[current_n+p] = v_rcv.data[p] - end - end - part = linear_indices(row_partition) - parts_snd, parts_rcv = assembly_neighbors(row_partition) - coo_entries_with_column_owner = map(tuple,I,J,Jown,V) - gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays - graph = ExchangeGraph(parts_snd,parts_rcv) - t1 = exchange(gi_snd,graph) - t2 = exchange(gj_snd,graph) - t3 = exchange(jo_snd,graph) - t4 = exchange(v_snd,graph) - @async begin - gi_rcv = fetch(t1) - gj_rcv = fetch(t2) - jo_rcv = fetch(t3) - v_rcv = fetch(t4) - map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - I,J,Jown,V - end -end - -# dofs_gids_prange can be either test_gids or trial_gids -# In the former case, gids is a vector of global test dof identifiers, while in the -# latter, a vector of global trial dof identifiers -function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) - if !ghost - _setup_prange_without_ghosts(dofs_gids_prange) - elseif isa(owners,Nothing) - _setup_prange_with_ghosts(dofs_gids_prange,gids) - else - _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) - end -end - -function _setup_prange( - dofs_gids_prange::AbstractVector{<:PRange}, - gids::AbstractMatrix; - ax=:rows,ghost=true,owners=nothing -) - @check ax ∈ (:rows,:cols) - block_ids = LinearIndices(dofs_gids_prange) - pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) - - gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange - gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] - gids_union = pvcat(gids_slice) - - owners_union = nothing - if !isnothing(owners) - owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] - owners_union = pvcat(owners_slice) - end - - return gids_union, owners_union - end |> tuple_of_arrays - - return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) -end - -# Create PRange for the rows of the linear system -# without local ghost dofs as per required in the -# FullyAssembledRows() parallel assembly strategy -function _setup_prange_without_ghosts(dofs_gids_prange::PRange) - ngdofs = length(dofs_gids_prange) - indices = map(partition(dofs_gids_prange)) do dofs_indices - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) - OwnAndGhostIndices(own_indices,ghost_indices) - end - return PRange(indices) -end - -# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition -# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
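# The @async block above posts all four exchanges before fetching any of them,
# so the communications overlap instead of completing one at a time. The same
# task-based pattern in plain Julia, with local work standing in for the
# exchanges:
function overlapped_sums(xs::Vector{Vector{Float64}})
  tasks = map(x -> @async(sum(x)), xs)  # launch everything first
  return map(fetch,tasks)               # only then wait on the results
end
@assert overlapped_sums([[1.0,2.0],[3.0]]) == [3.0,3.0]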
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed -# is that each processor can determine locally with a single layer of ghost cells the global indices and associated -# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities -# (i.e., either cells or faces). -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> dof PRange ghost ids - gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition) |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner) -end - -# Here we cannot assume that the sparse communication graph underlying -# trial_dofs_gids_partition is a superset of the one underlying indices. -# Here we chould check whether it is included and call _find_neighbours!() -# if this is the case. At present, we are not taking advantage of this, -# but let the parallel scalable algorithm to compute the reciprocal to do the job. -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - gids,owners,dofs_gids_partition) do gids, owners, indices - ghost_touched = Dict{Int,Bool}() - ghost_to_global = Int64[] - ghost_to_owner = Int64[] - me = part_id(indices) - for (j,jo) in zip(gids,owners) - if jo != me - if !haskey(ghost_touched,j) - push!(ghost_to_global,j) - push!(ghost_to_owner,jo) - ghost_touched[j] = true - end - end - end - ghost_to_global, ghost_to_owner - end |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=false) -end - -function _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=true) - indices = map(dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner) - OwnAndGhostIndices(own_indices,ghost_indices) - end - if discover_neighbours - _find_neighbours!(indices,dofs_gids_partition) - end - return PRange(indices) -end diff --git a/src/Assembly.jl b/src/Assembly.jl new file mode 100644 index 00000000..25e2411f --- /dev/null +++ b/src/Assembly.jl @@ -0,0 +1,199 @@ + +# Parallel assembly strategies + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +# PSparseMatrix and PVector builders + +struct PSparseMatrixBuilder{T,B} + local_matrix_type::Type{T} + strategy::B +end + +function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv + T = eltype(Tv) + return PSparseMatrix{T} +end + +struct PVectorBuilder{T,B} + local_vector_type::Type{T} + strategy::B +end + +function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv + T = eltype(Tv) + 
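# How the renamed strategies are used downstream (see the test updates later in
# this patch): LocallyAssembled() replaces FullyAssembledRows() and Assembled()
# replaces SubAssembledRows(). A representative call, assuming trial/test
# spaces U and V have already been built:
#
#   assem = SparseMatrixAssembler(U, V, LocallyAssembled()) # rows carry no ghosts
#   assem = SparseMatrixAssembler(U, V, Assembled())        # ghost rows exchanged during assembly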
return PVector{T} +end + +# Distributed counters and allocators + +""" + DistributedCounter{S,T,N} <: GridapType + +Distributed N-dimensional counter, with local counters of type T. +Follows assembly strategy S. +""" +struct DistributedCounter{S,T,N,A,B} <: GridapType + counters :: A + axes :: B + strategy :: S + function DistributedCounter( + counters :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: Algebra.AssemblyStrategy + ) where {T,N} + A, B, S = typeof(counters), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(counters,axes,strategy) + end +end + +Base.axes(a::DistributedCounter) = a.axes +local_views(a::DistributedCounter) = a.counters + +const PVectorCounter{S,T} = DistributedCounter{S,T<:Algebra.ArrayCounter,1} +Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() + +const PSparseMatrixCounter{S,T} = DistributedCounter{S,T<:Algebra.CounterCOO,2} +Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop() + +""" + DistributedAllocation{S,T,N} <: GridapType + +Distributed N-dimensional allocator, with local allocators of type T. +Follows assembly strategy S. +""" +struct DistributedAllocation{S,T,N,A,B} <: GridapType + allocs :: A + axes :: B + strategy :: S + function DistributedAllocation( + allocs :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: Algebra.AssemblyStrategy + ) where {T,N} + A, B, S = typeof(allocs), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(allocs,axes,strategy) + end +end + +Base.axes(a::DistributedAllocation) = a.axes +local_views(a::DistributedAllocation) = a.allocs + +const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} +const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} + +# PSparseMatrix assembly chain +# +# 1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter +# 2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation +# 3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix + +function Algebra.nz_counter(builder::PSparseMatrixBuilder,axs::NTuple{2,<:PRange}) + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do rows,cols + axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) + Algebra.CounterCOO{A}(axs) + end + DistributedCounter(builder.par_strategy,counters,rows,cols) +end + +function Algebra.nz_allocation(a::PSparseMatrixCounter) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +# PVector assembly chain: +# +# 1 - nz_counter(PVectorBuilder) -> PVectorCounter +# 2 - nz_allocation(PVectorCounter) -> PVectorAllocation +# 3 - create_from_nz(PVectorAllocation) -> PVector + +function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) where VT + rows, = axs + counters = map(partition(rows)) do rows + axs = (Base.OneTo(local_length(rows)),) + nz_counter(ArrayBuilder(VT),axs) + end + DistributedCounter(counters,axs,builder.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled}) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + values = map(nz_allocation,local_views(a)) + allocs = map(TrackedArrayAllocation,values) + return DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector}) + # This point MUST NEVER be reached. 
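# The three-step chain listed above, spelled out as a sketch. The local matrix
# type and the (rows, cols) PRanges are assumed to come from the FE spaces;
# inside Gridap this sequence is driven by the SparseMatrixAssembler machinery
# rather than called by hand:
#
#   builder = PSparseMatrixBuilder(SparseMatrixCSC{Float64,Int}, Assembled())
#   counter = Algebra.nz_counter(builder, (rows, cols)) # 1: symbolic loop counts entries
#   alloc   = Algebra.nz_allocation(counter)            # 2: numeric loop fills COO triplets
#   A       = Algebra.create_from_nz(alloc)             # 3: communicate and compress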
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}) + rows = _setup_prange_without_ghosts(axes(a,1)) + _rhs_callback(a,rows) +end + +function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S + rows = _setup_prange_from_pvector_allocation(a) + b = _rhs_callback(a,rows) + t2 = assemble!(b) + if t2 !== nothing + wait(t2) + end + return b +end + +# PSystem assembly chain: +# +# When assembling a full system (matrix + vector), it is more efficient to +# overlap communications the assembly of the matrix and the vector. +# Not only it is faster, but also necessary to ensure identical ghost indices +# in both the matrix and vector rows. +# This is done by using the following specializations chain: + +function Arrays.nz_allocation( + a::PSparseMatrixCounter{<:Assembled}, + b::PVectorCounter{<:Assembled} +) + A = nz_allocation(a) # PSparseMatrixAllocation + B = nz_allocation(b) # PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:LocallyAssembled}, + b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector} +) + function callback(rows) + _rhs_callback(b,rows) + end + A, B = _fa_create_from_nz_with_callback(callback,a) + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:Assembled}, + b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} +) + function callback(rows) + _rhs_callback(b,rows) + end + function async_callback(b) + assemble!(b) + end + A, B = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + return A, B +end diff --git a/src/DistributedUtils.jl b/src/DistributedUtils.jl deleted file mode 100644 index 0b34a0a1..00000000 --- a/src/DistributedUtils.jl +++ /dev/null @@ -1,44 +0,0 @@ - -function permuted_variable_partition( - n_own::AbstractArray{<:Integer}, - gids::AbstractArray{<:AbstractArray{<:Integer}}, - owners::AbstractArray{<:AbstractArray{<:Integer}}; - n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), - start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) -) - ranks = linear_indices(n_own) - np = length(ranks) - map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners - n_local = length(gids) - n_ghost = n_local - n_own - perm = fill(zero(Int32),n_local) - ghost_gids = fill(zero(Int),n_ghost) - ghost_owners = fill(zero(Int32),n_ghost) - - n_ghost = 0 - for (lid,(gid,owner)) in enumerate(zip(gids,owners)) - if owner == rank - perm[lid] = gid-start+1 - else - n_ghost += 1 - ghost_gids[n_ghost] = gid - ghost_owners[n_ghost] = owner - perm[lid] = n_own + n_ghost - end - end - @assert n_ghost == n_local - n_own - - ghost = GhostIndices(n_global,ghost_gids,ghost_owners) - dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( - CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost - ) - permute_indices(dof_ids,perm) - end -end - -function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T - ptrs = Vector{Int32}(undef,length(vv)+1) - Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv) - Arrays.length_to_ptrs!(ptrs) - return ptrs -end diff --git a/src/Geometry.jl b/src/Geometry.jl index 4c788242..0f814b3c 100644 --- a/src/Geometry.jl +++ b/src/Geometry.jl @@ -582,7 +582,7 @@ function filter_cells_when_needed( end function filter_cells_when_needed( - portion::FullyAssembledRows, + 
portion::LocallyAssembled, cell_gids::AbstractLocalIndices, trian::Triangulation) @@ -590,7 +590,7 @@ function filter_cells_when_needed( end function filter_cells_when_needed( - portion::SubAssembledRows, + portion::Assembled, cell_gids::AbstractLocalIndices, trian::Triangulation) diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl index 7edf2a48..b55ddfe8 100644 --- a/src/GridapDistributed.jl +++ b/src/GridapDistributed.jl @@ -42,12 +42,15 @@ export with_ghost, no_ghost export redistribute -include("DistributedUtils.jl") +include("GridapExtras.jl") +include("PArraysExtras.jl") include("BlockPartitionedArrays.jl") include("Algebra.jl") +include("Assembly.jl") + include("Geometry.jl") include("CellData.jl") diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl new file mode 100644 index 00000000..df5b1186 --- /dev/null +++ b/src/GridapExtras.jl @@ -0,0 +1,55 @@ + +# Extensions to Gridap/Arrays/Tables.jl + +function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T + ptrs = Vector{Int32}(undef,length(vv)+1) + Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv) + Arrays.length_to_ptrs!(ptrs) + return ptrs +end + +# New type of Vector allocation + +""" + TrackedArrayAllocation{T} <: GridapType + +Array that keeps track of which entries have been touched. For assembly purposes. +""" +struct TrackedArrayAllocation{T} + values :: Vector{T} + touched :: Vector{Bool} +end + +function TrackedArrayAllocation(values) + touched = fill(false,length(values)) + TrackedArrayAllocation(values,touched) +end + +Algebra.LoopStyle(::Type{<:TrackedArrayAllocation}) = Algebra.Loop() +Algebra.create_from_nz(a::TrackedArrayAllocation) = a.values + +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + if i > 0 + a.touched[i] = true + end + nothing +end +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v,i) + if i > 0 + a.touched[i] = true + a.values[i] = combine(v,a.values[i]) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + for ie in i + Arrays.add_entry!(combine,a,nothing,ie) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v,i) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(combine,a,ve,ie) + end + nothing +end diff --git a/src/MultiField.jl b/src/MultiField.jl index 689a0621..75e55d83 100644 --- a/src/MultiField.jl +++ b/src/MultiField.jl @@ -453,7 +453,7 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, - par_strategy=SubAssembledRows()) where {NB,SB,P} + par_strategy=Assembled()) where {NB,SB,P} block_idx = CartesianIndices((NB,NB)) block_rows = blocks(test.gids) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl new file mode 100644 index 00000000..f4a5ae2e --- /dev/null +++ b/src/PArraysExtras.jl @@ -0,0 +1,210 @@ + +""" + permuted_variable_partition(n_own,gids,owners; n_global, start) + +Create indices which are a permuted version of a variable_partition. +The advantage of this wrt the `LocalIndices` type, is that we can compute +dof owners with minimal communications, since we only need the size of the blocks. + +NOTE: This is the type for our FESpace dof_ids. 
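# A quick usage sketch for TrackedArrayAllocation (defined in GridapExtras.jl
# above): touches are recorded even when no value is contributed, which is what
# the ghost-row detection during vector assembly relies on. Runnable from
# within the package namespace:
#
#   alloc = TrackedArrayAllocation(zeros(3))
#   Arrays.add_entry!(+, alloc, 2.0, 1)         # touch and accumulate
#   Arrays.add_entries!(+, alloc, nothing, [2]) # touch only (symbolic pass)
#   @assert alloc.touched == [true, true, false]
#   @assert Algebra.create_from_nz(alloc) == [2.0, 0.0, 0.0]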
+""" +function permuted_variable_partition( + n_own::AbstractArray{<:Integer}, + gids::AbstractArray{<:AbstractArray{<:Integer}}, + owners::AbstractArray{<:AbstractArray{<:Integer}}; + n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), + start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +) + ranks = linear_indices(n_own) + np = length(ranks) + map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners + n_local = length(gids) + n_ghost = n_local - n_own + perm = fill(zero(Int32),n_local) + ghost_gids = fill(zero(Int),n_ghost) + ghost_owners = fill(zero(Int32),n_ghost) + + n_ghost = 0 + for (lid,(gid,owner)) in enumerate(zip(gids,owners)) + if owner == rank + perm[lid] = gid-start+1 + else + n_ghost += 1 + ghost_gids[n_ghost] = gid + ghost_owners[n_ghost] = owner + perm[lid] = n_own + n_ghost + end + end + @assert n_ghost == n_local - n_own + + ghost = GhostIndices(n_global,ghost_gids,ghost_owners) + dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( + CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost + ) + permute_indices(dof_ids,perm) + end +end + +# Linear algebra + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check partition(axes(x,1)) === partition(axes(y,1)) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... 
+ @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# Array of PArrays -> PArray of Arrays +# TODO: I think this is now implemented in PartitionedArrays.jl (check) +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +# Communication extras, subpartitioning extras + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end + diff --git a/src/_Algebra_old.jl b/src/_Algebra_old.jl new file mode 100644 index 00000000..e5aefd6a --- /dev/null +++ b/src/_Algebra_old.jl @@ -0,0 +1,1238 @@ + +# Vector allocation + +function Algebra.allocate_vector(::Type{<:PVector{V}},ids::PRange) where {V} + PVector{V}(undef,partition(ids)) +end + +function Algebra.allocate_vector(::Type{<:BlockPVector{V}},ids::BlockPRange) where {V} + BlockPVector{V}(undef,ids) +end + +function Algebra.allocate_in_range(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(PVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(PVector{V},matrix) +end + +function Algebra.allocate_in_range(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(BlockPVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(BlockPVector{V},matrix) +end + +# PartitionedArrays extras + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check partition(axes(x,1)) === partition(axes(y,1)) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# TODO: Can we remove this? 
+# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.ArrayCounter,axes) + @notimplemented +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end + +# Array of PArrays -> PArray of Arrays +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end + +# local_views + +function local_views(a) + @abstractmethod +end + +function get_parts(a) + return linear_indices(local_views(a)) +end + +function local_views(a::AbstractVector,rows) + @notimplemented +end + +function local_views(a::AbstractMatrix,rows,cols) + @notimplemented +end + +local_views(a::AbstractArray) = a +local_views(a::PRange) = partition(a) +local_views(a::PVector) = partition(a) +local_views(a::PSparseMatrix) = partition(a) + +function local_views(a::BlockPRange) + map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays +end + +function local_views(a::BlockPArray) + vals = map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# change_ghost + +function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T + same_partition = (a.index_partition === partition(ids)) + a_new = same_partition ? a : change_ghost(T,a,ids) + if make_consistent && (!same_partition || !is_consistent) + consistent!(a_new) |> wait + end + return a_new +end + +function change_ghost(::Type{<:AbstractVector},a::PVector,ids::PRange) + a_new = similar(a,eltype(a),(ids,)) + # Equivalent to copy!(a_new,a) but does not check that owned indices match + map(copy!,own_values(a_new),own_values(a)) + return a_new +end + +function change_ghost(::Type{<:OwnAndGhostVectors},a::PVector,ids::PRange) + values = map(own_values(a),partition(ids)) do own_vals,ids + ghost_vals = fill(zero(eltype(a)),ghost_length(ids)) + perm = PartitionedArrays.local_permutation(ids) + OwnAndGhostVectors(own_vals,ghost_vals,perm) + end + return PVector(values,partition(ids)) +end + +function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_consistent=false) + vals = map(blocks(a),blocks(ids)) do a, ids + change_ghost(a,ids;is_consistent=is_consistent,make_consistent=make_consistent) + end + return BlockPVector(vals,ids) +end + +# TODO: Can we remove this? +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case +# there might be gids in b which are not present in a +function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) + a_local_to_b_local = fill(Int32(-1),local_length(a)) + b_local_to_global = local_to_global(b) + a_global_to_local = global_to_local(a) + for blid in 1:local_length(b) + gid = b_local_to_global[blid] + if a_global_to_local[gid] != zero(eltype(a_global_to_local)) + alid = a_global_to_local[gid] + a_local_to_b_local[alid] = blid + end + end + a_local_to_b_local +end + +# TODO: Can we remove this? +# This type is required in order to be able to access the local portion +# of distributed sparse matrices and vectors using local indices from the +# distributed test and trial spaces +struct LocalView{T,N,A,B} <:AbstractArray{T,N} + plids_to_value::A + d_to_lid_to_plid::B + local_size::NTuple{N,Int} + function LocalView( + plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} + A = typeof(plids_to_value) + B = typeof(d_to_lid_to_plid) + local_size = map(length,d_to_lid_to_plid) + new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) + end +end + +Base.size(a::LocalView) = a.local_size +Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() +function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + if all(i->i>0,plids) + a.plids_to_value[plids...] + else + zero(T) + end +end +function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" + a.plids_to_value[plids...] = v +end + +function _lid_to_plid(lid,lid_to_plid) + plid = lid_to_plid[lid] + plid +end + +function local_views(a::PVector,new_rows::PRange) + old_rows = axes(a,1) + if partition(old_rows) === partition(new_rows) + partition(a) + else + map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows + LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) + end + end +end + +function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) + old_rows, old_cols = axes(a) + if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) + partition(a) + else + map(partition(a), + partition(old_rows),partition(old_cols), + partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols + rl2lmap = find_local_to_local_map(new_rows,old_rows) + cl2lmap = find_local_to_local_map(new_cols,old_cols) + LocalView(matrix_partition,(rl2lmap,cl2lmap)) + end + end +end + +function local_views(a::BlockPVector,new_rows::BlockPRange) + vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays + return map(mortar,vals) +end + +function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) + vals = map(CartesianIndices(blocksize(a))) do I + local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# PSparseMatrix assembly + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +struct PSparseMatrixBuilderCOO{T,B} + local_matrix_type::Type{T} + par_strategy::B +end + +function Algebra.nz_counter( + builder::PSparseMatrixBuilderCOO{A}, 
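# What find_local_to_local_map computes, traced on plain vectors: for each
# local id of `a`, the local id of `b` holding the same global id, or -1 when
# that gid is absent from `b`. Toy data over 6 global ids:
#
#   b_local_to_global  = [4, 1, 6]
#   a_global_to_local  = [2, 0, 1, 3, 0, 0]  # a holds gids 1, 3, 4
#   # scanning b: gid 4 -> a lid 3, gid 1 -> a lid 2; gid 6 is not in a
#   a_local_to_b_local = [-1, 2, 1]          # a lid 1 (gid 3) has no match in b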
axs::Tuple{<:PRange,<:PRange} +) where A + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do r,c + axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) + Algebra.CounterCOO{A}(axs) + end + DistributedCounterCOO(builder.par_strategy,counters,rows,cols) +end + +function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv + T = eltype(Tv) + return PSparseMatrix{T} +end + + +""" +""" +struct DistributedCounterCOO{A,B,C,D} <: GridapType + par_strategy::A + counters ::B + test_gids ::C + trial_gids ::D + function DistributedCounterCOO( + par_strategy, + counters::AbstractArray{<:Algebra.CounterCOO}, + test_gids::PRange, + trial_gids::PRange + ) + A = typeof(par_strategy) + B = typeof(counters) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) + end +end + +function local_views(a::DistributedCounterCOO) + return a.counters +end + +function local_views(a::DistributedCounterCOO,test_gids,trial_gids) + @check PArrays.matching_local_indices(test_gids,a.test_gids) + @check PArrays.matching_local_indices(trial_gids,a.trial_gids) + return a.counters +end + +function Algebra.nz_allocation(a::DistributedCounterCOO) + allocs = map(nz_allocation,a.counters) # Array{AllocationCOO} + DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) +end + +struct DistributedAllocationCOO{A,B,C,D} <: GridapType + par_strategy::A + allocs::B + test_gids::C + trial_gids::D + function DistributedAllocationCOO( + par_strategy, + allocs::AbstractArray{<:Algebra.AllocationCOO}, + test_gids::PRange, + trial_gids::PRange) + A = typeof(par_strategy) + B = typeof(allocs) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) + end +end + +function change_axes( + a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, + axes::Tuple{<:PRange,<:PRange} +) where {A,B} + local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols + (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) +end + +function change_axes( + a::MatrixBlock{<:DistributedAllocationCOO}, + axes::Tuple{<:Vector,<:Vector} +) + block_ids = CartesianIndices(a.array) + rows, cols = axes + array = map(block_ids) do I + change_axes(a[I],(rows[I[1]],cols[I[2]])) + end + return ArrayBlock(array,a.touched) +end + +function local_views(a::DistributedAllocationCOO) + a.allocs +end + +function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) + @check test_gids === a.test_gids + @check trial_gids === a.trial_gids + a.allocs +end + +function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) + array = map(local_views,a.array) |> to_parray_of_arrays + return map(ai -> ArrayBlock(ai,a.touched),array) +end + +function get_allocations(a::DistributedAllocationCOO) + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + +function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) + tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays + return tuple_of_array_of_parrays +end + +get_test_gids(a::DistributedAllocationCOO) = a.test_gids +get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids +get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) +get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = 
map(get_trial_gids,diag(a.array)) + +function Algebra.create_from_nz(a::PSparseMatrix) + # For LocallyAssembled the underlying Exchanger should + # not have ghost layer making assemble! do nothing (TODO check) + assemble!(a) |> wait + a +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:LocallyAssembled}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:LocallyAssembled}}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function _fa_create_from_nz_with_callback(callback,a) + + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) + b = callback(rows) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the range for cols + cols = _setup_prange(trial_gids,J;ax=:cols) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(I,J,V,rows,cols) + return A, b +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:Assembled}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:Assembled}}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function _sa_create_from_nz_with_callback(callback,async_callback,a,b) + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the Prange for the rows + rows = _setup_prange(test_gids,I;ax=:rows) + + # Move (I,J,V) triplets to the owner process of each row I. 
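# The index round trip driven by the two create_from_nz helpers above, on plain
# vectors for a single part: COO row ids leave the cell loop in FE-space local
# numbering, are lifted to global ids, and are finally renumbered into the
# local ids of the assembled row partition.
#
#   fe_local_to_global   = [10, 11, 12]         # FE-space numbering
#   rows_global_to_local = Dict(10=>1, 12=>2)   # final row partition
#   I = [1, 3, 1]                               # as produced by the cell loop
#   I = fe_local_to_global[I]                   # to_global_indices! -> [10, 12, 10]
#   I = [rows_global_to_local[g] for g in I]    # to_local_indices!  -> [1, 2, 1]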
+ # The triplets are accompanyed which Jo which is the process column owner + Jo = get_gid_owners(J,trial_gids;ax=:cols) + t = _assemble_coo!(I,J,V,rows;owners=Jo) + + # Here we can overlap computations + # This is a good place to overlap since + # sending the matrix rows is a lot of data + if !isa(b,Nothing) + bprange=_setup_prange_from_pvector_allocation(b) + b = callback(bprange) + end + + # Wait the transfer to finish + wait(t) + + # Create the Prange for the cols + cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) + + # Overlap rhs communications with CSC compression + t2 = async_callback(b) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(a,I,J,V,rows,cols) + + # Wait the transfer to finish + if !isa(t2,Nothing) + wait(t2) + end + + return A, b +end + + +# PVector assembly + +struct PVectorBuilder{T,B} + local_vector_type::Type{T} + par_strategy::B +end + +function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange}) + T = builder.local_vector_type + rows, = axs + counters = map(partition(rows)) do rows + axs = (Base.OneTo(local_length(rows)),) + nz_counter(ArrayBuilder(T),axs) + end + PVectorCounter(builder.par_strategy,counters,rows) +end + +function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv + T = eltype(Tv) + return PVector{T} +end + +struct PVectorCounter{A,B,C} + par_strategy::A + counters::B + test_gids::C +end + +Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() + +function local_views(a::PVectorCounter) + a.counters +end + +function local_views(a::PVectorCounter,rows) + @check rows === a.test_gids + a.counters +end + +function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled}) + dofs = a.test_gids + values = map(nz_allocation,a.counters) + PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) +end + +struct PVectorAllocationTrackOnlyValues{A,B,C} + par_strategy::A + values::B + test_gids::C +end + +function local_views(a::PVectorAllocationTrackOnlyValues) + a.values +end + +function local_views(a::PVectorAllocationTrackOnlyValues,rows) + @check rows === a.test_gids + a.values +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:LocallyAssembled}) + rows = _setup_prange_without_ghosts(a.test_gids) + _rhs_callback(a,rows) +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:Assembled}) + # This point MUST NEVER be reached. 
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function _rhs_callback(row_partitioned_vector_partition,rows) + # The ghost values in row_partitioned_vector_partition are + # aligned with the FESpace but not with the ghost values in the rows of A + b_fespace = PVector(row_partitioned_vector_partition.values, + partition(row_partitioned_vector_partition.test_gids)) + + # This one is aligned with the rows of A + b = similar(b_fespace,eltype(b_fespace),(rows,)) + + # First transfer owned values + b .= b_fespace + + # Now transfer ghost + function transfer_ghost(b,b_fespace,ids,ids_fespace) + num_ghosts_vec = ghost_length(ids) + gho_to_loc_vec = ghost_to_local(ids) + loc_to_glo_vec = local_to_global(ids) + gid_to_lid_fe = global_to_local(ids_fespace) + for ghost_lid_vec in 1:num_ghosts_vec + lid_vec = gho_to_loc_vec[ghost_lid_vec] + gid = loc_to_glo_vec[lid_vec] + lid_fespace = gid_to_lid_fe[gid] + b[lid_vec] = b_fespace[lid_fespace] + end + end + map( + transfer_ghost, + partition(b), + partition(b_fespace), + b.index_partition, + b_fespace.index_partition + ) + + return b +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:LocallyAssembled}, + c_fespace::PVectorAllocationTrackOnlyValues{<:LocallyAssembled} +) + + function callback(rows) + _rhs_callback(c_fespace,rows) + end + + A,b = _fa_create_from_nz_with_callback(callback,a) + return A,b +end + +struct PVectorAllocationTrackTouchedAndValues{A,B,C} + allocations::A + values::B + test_gids::C +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:Assembled}, + b::PVectorAllocationTrackTouchedAndValues +) + + function callback(rows) + _rhs_callback(b,rows) + end + + function async_callback(b) + # now we can assemble contributions + assemble!(b) + end + + A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + return A,b +end + +struct ArrayAllocationTrackTouchedAndValues{A} + touched::Vector{Bool} + values::A +end + +Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() + +function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) + @check PArrays.matching_local_indices(rows, a.test_gids) + a.allocations +end + +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if i > 0 + a.touched[i]=true + if !isnothing(v) + a.values[i]=c(v,a.values[i]) + end + end + nothing +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if !isa(v,Nothing) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(c,a,ve,ie) + end + else + for ie in i + Arrays.add_entry!(c,a,nothing,ie) + end + end + nothing +end + + +function _setup_touched_and_allocations_arrays(values) + touched = map(values) do values + fill!(Vector{Bool}(undef,length(values)),false) + end + allocations = map(values,touched) do values,touched + ArrayAllocationTrackTouchedAndValues(touched,values) + end + touched, allocations +end + +function Arrays.nz_allocation( + a::DistributedCounterCOO{<:Assembled}, + b::PVectorCounter{<:Assembled} +) + A = nz_allocation(a) + dofs = b.test_gids + values = map(nz_allocation,b.counters) + 
touched, allocations = _setup_touched_and_allocations_arrays(values) + B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) + return A, B +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + dofs = a.test_gids + values = map(nz_allocation,a.counters) + touched,allocations=_setup_touched_and_allocations_arrays(values) + return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) +end + +function local_views(a::PVectorAllocationTrackTouchedAndValues) + a.allocations +end + +function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) + + # Find the ghost rows + allocations = local_views(a.allocations) + indices = partition(a.test_gids) + I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices + dofs_lids_touched = findall(allocation.touched) + loc_to_gho = local_to_ghost(indices) + n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) + I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) + cur = 1 + for lid in dofs_lids_touched + dof_lid = loc_to_gho[lid] + if dof_lid != 0 + I_ghost_lids[cur] = dof_lid + cur = cur+1 + end + end + I_ghost_lids + end + + ghost_to_global, ghost_to_owner = map( + find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays + + ngids = length(a.test_gids) + _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) + rows = _setup_prange_from_pvector_allocation(a) + b = _rhs_callback(a,rows) + t2 = assemble!(b) + # Wait the transfer to finish + if t2 !== nothing + wait(t2) + end + return b +end + +# Common Assembly Utilities +function first_gdof_from_ids(ids) + if own_length(ids) == 0 + return 1 + end + lid_to_gid = local_to_global(ids) + owned_to_lid = own_to_local(ids) + return Int(lid_to_gid[first(owned_to_lid)]) +end + +function find_gid_and_owner(ighost_to_jghost,jindices) + jghost_to_local = ghost_to_local(jindices) + jlocal_to_global = local_to_global(jindices) + jlocal_to_owner = local_to_owner(jindices) + ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) + + ighost_to_global = jlocal_to_global[ighost_to_jlocal] + ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] + return ighost_to_global, ighost_to_owner +end + +# The given ids are assumed to be a sub-set of the lids +function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) + glo_to_loc = global_to_local(a) + loc_to_gho = local_to_ghost(a) + + # First pass: Allocate + i = 0 + ghost_lids_touched = fill(false,ghost_length(a)) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + i += 1 + end + end + gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) + + # Second pass: fill + i = 1 + fill!(ghost_lids_touched,false) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + gids_ghost_lid_to_ghost_lid[i] = ghost_lid + i += 1 + end + end + + return gids_ghost_lid_to_ghost_lid +end + +# Find the neighbours of partition1 trying +# to use those in partition2 as a hint +function _find_neighbours!(partition1, partition2) + partition2_snd, partition2_rcv = assembly_neighbors(partition2) + partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv) + return assembly_neighbors(partition1; neighbors=partition2_graph) +end + +# to_global! 
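# find_gid_and_owner above, traced with plain vectors: ghost positions of index
# set i are translated into local positions of index set j, then read off as
# global ids and owners.
#
#   jghost_to_local  = [3, 5]        # j's ghosts sit at local positions 3 and 5
#   jlocal_to_global = [1, 2, 7, 4, 9]
#   jlocal_to_owner  = [1, 1, 2, 1, 3]
#   ighost_to_jghost = [2, 1]
#   ighost_to_jlocal = sort(jghost_to_local[[2, 1]]) # -> [3, 5]
#   # resulting globals: [7, 9]; owners: [2, 3]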
& to_local! analogs, needed for dispatching in block assembly + +function to_local_indices!(I,ids::PRange;kwargs...) + map(to_local!,I,partition(ids)) +end + +function to_global_indices!(I,ids::PRange;kwargs...) + map(to_global!,I,partition(ids)) +end + +function get_gid_owners(I,ids::PRange;kwargs...) + map(I,partition(ids)) do I, indices + glo_to_loc = global_to_local(indices) + loc_to_own = local_to_owner(indices) + map(x->loc_to_own[glo_to_loc[x]], I) + end +end + +for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] + @eval begin + function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) + map($f,I,ids) + end + + function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) + @check ax ∈ [:rows,:cols] + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if ax == :rows + $f(I[i,j],ids[i]) + else + $f(I[i,j],ids[j]) + end + end + end + end +end + + +function _setup_matrix( + a,I,J,V,rows::PRange,cols::PRange +) + _rows = PRange(map(remove_ghost,partition(rows))) + asys = change_axes(a,(_rows,cols)) + values = map(create_from_nz,local_views(asys)) + PSparseMatrix(values,partition(_rows),partition(cols),true) +end + +function _setup_matrix(a,I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) + block_ids = CartesianIndices(I) + block_mats = map(block_ids) do id + i = id[1]; j = id[2]; + _setup_matrix(a[i,j],I[i,j],J[i,j],V[i,j],rows[i],cols[j]) + end + return mortar(block_mats) +end + +# _assemble_coo! : local coo triplets + global PRange -> Global coo values + +function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) + if isa(owners,Nothing) + PArrays.assemble_coo!(I,J,V,partition(rows)) + else + assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) + end +end + +function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if isa(owners,Nothing) + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) + else + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) + end + end +end + +function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) + """ + Returns three JaggedArrays with the coo triplets + to be sent to the corresponding owner parts in parts_snd + """ + function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) + global_to_local_row = global_to_local(row_lids) + local_row_to_owner = local_to_owner(row_lids) + owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) + ptrs = zeros(Int32,length(parts_snd)+1) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + ptrs[owner_to_i[owner]+1] +=1 + end + end + PArrays.length_to_ptrs!(ptrs) + gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) + gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) + jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) + v_snd_data = zeros(eltype(k_v),ptrs[end]-1) + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + gj = k_gj[k] + v = k_v[k] + p = ptrs[owner_to_i[owner]] + gi_snd_data[p] = gi + gj_snd_data[p] = gj + jo_snd_data[p] = k_jo[k] + v_snd_data[p] = v + k_v[k] = zero(v) + ptrs[owner_to_i[owner]] += 1 + end + end + PArrays.rewind_ptrs!(ptrs) + gi_snd = JaggedArray(gi_snd_data,ptrs) + gj_snd = JaggedArray(gj_snd_data,ptrs) + jo_snd = JaggedArray(jo_snd_data,ptrs) + v_snd = JaggedArray(v_snd_data,ptrs) + gi_snd, gj_snd, jo_snd, v_snd + 
end + """ + Pushes to coo_entries_with_column_owner the tuples + gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes + """ + function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + current_n = length(k_gi) + new_n = current_n + length(gi_rcv.data) + resize!(k_gi,new_n) + resize!(k_gj,new_n) + resize!(k_jo,new_n) + resize!(k_v,new_n) + for p in 1:length(gi_rcv.data) + k_gi[current_n+p] = gi_rcv.data[p] + k_gj[current_n+p] = gj_rcv.data[p] + k_jo[current_n+p] = jo_rcv.data[p] + k_v[current_n+p] = v_rcv.data[p] + end + end + part = linear_indices(row_partition) + parts_snd, parts_rcv = assembly_neighbors(row_partition) + coo_entries_with_column_owner = map(tuple,I,J,Jown,V) + gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays + graph = ExchangeGraph(parts_snd,parts_rcv) + t1 = exchange(gi_snd,graph) + t2 = exchange(gj_snd,graph) + t3 = exchange(jo_snd,graph) + t4 = exchange(v_snd,graph) + @async begin + gi_rcv = fetch(t1) + gj_rcv = fetch(t2) + jo_rcv = fetch(t3) + v_rcv = fetch(t4) + map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + I,J,Jown,V + end +end + +# dofs_gids_prange can be either test_gids or trial_gids +# In the former case, gids is a vector of global test dof identifiers, while in the +# latter, a vector of global trial dof identifiers +function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) + if !ghost + _setup_prange_without_ghosts(dofs_gids_prange) + elseif isa(owners,Nothing) + _setup_prange_with_ghosts(dofs_gids_prange,gids) + else + _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) + end +end + +function _setup_prange( + dofs_gids_prange::AbstractVector{<:PRange}, + gids::AbstractMatrix; + ax=:rows,ghost=true,owners=nothing +) + @check ax ∈ (:rows,:cols) + block_ids = LinearIndices(dofs_gids_prange) + pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) + + gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange + gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] + gids_union = pvcat(gids_slice) + + owners_union = nothing + if !isnothing(owners) + owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] + owners_union = pvcat(owners_slice) + end + + return gids_union, owners_union + end |> tuple_of_arrays + + return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) +end + +# Create PRange for the rows of the linear system +# without local ghost dofs as per required in the +# LocallyAssembled() parallel assembly strategy +function _setup_prange_without_ghosts(dofs_gids_prange::PRange) + ngdofs = length(dofs_gids_prange) + indices = map(partition(dofs_gids_prange)) do dofs_indices + owner = part_id(dofs_indices) + own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) + ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) + OwnAndGhostIndices(own_indices,ghost_indices) + end + return PRange(indices) +end + +# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition +# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
+# The precondition required for the consistency of any parallel assembly process in GridapDistributed
+# is that each processor can determine locally, with a single layer of ghost cells, the global indices
+# and associated processor owners of the rows that it touches after assembly of integration terms posed
+# on locally-owned entities (i.e., either cells or faces).
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> dof PRange ghost ids
+  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition
+  ) |> tuple_of_arrays
+
+  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
+end
+
+# Here we cannot assume that the sparse communication graph underlying
+# trial_dofs_gids_partition is a superset of the one underlying indices.
+# We should check whether it is included and call _find_neighbours!()
+# if this is the case. At present, we do not take advantage of this, and
+# instead let the scalable parallel algorithm that computes the reciprocal do the job.
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    gids,owners,dofs_gids_partition) do gids, owners, indices
+    ghost_touched = Dict{Int,Bool}()
+    ghost_to_global = Int64[]
+    ghost_to_owner = Int64[]
+    me = part_id(indices)
+    for (j,jo) in zip(gids,owners)
+      if jo != me
+        if !haskey(ghost_touched,j)
+          push!(ghost_to_global,j)
+          push!(ghost_to_owner,jo)
+          ghost_touched[j] = true
+        end
+      end
+    end
+    ghost_to_global, ghost_to_owner
+  end |> tuple_of_arrays
+
+  return _setup_prange_impl_(
+    ngdofs,
+    dofs_gids_partition,
+    gids_ghost_to_global,
+    gids_ghost_to_owner;
+    discover_neighbours=false
+  )
+end
+
+function _setup_prange_impl_(
+  ngdofs,
+  dofs_gids_partition,
+  gids_ghost_to_global,
+  gids_ghost_to_owner;
+  discover_neighbours=true
+)
+  indices = map(dofs_gids_partition,
+                gids_ghost_to_global,
+                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner
+    owner = part_id(dofs_indices)
+    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
+    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
+    OwnAndGhostIndices(own_indices,ghost_indices)
+  end
+  if discover_neighbours
+    _find_neighbours!(indices,dofs_gids_partition)
+  end
+  return PRange(indices)
+end
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc97..645d7dc2 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))
 
-  assem = SparseMatrixAssembler(X,Y,FullyAssembledRows())
+  assem = SparseMatrixAssembler(X,Y,LocallyAssembled())
   A1 = assemble_matrix(assem,matdata)
   b1 = assemble_vector(assem,vecdata)
   A2,b2 = assemble_matrix_and_vector(assem,data);
@@ -68,7 +68,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   bmatdata = collect_cell_matrix(Xb,Yb,biform(ub,vb))
   bvecdata = 
collect_cell_vector(Yb,liform(vb)) - assem_blocks = SparseMatrixAssembler(Xb,Yb,FullyAssembledRows()) + assem_blocks = SparseMatrixAssembler(Xb,Yb,LocallyAssembled()) A1_blocks = assemble_matrix(assem_blocks,bmatdata); b1_blocks = assemble_vector(assem_blocks,bvecdata); @test is_same_vector(b1_blocks,b1,Yb,Y) diff --git a/test/DivConformingTests.jl b/test/DivConformingTests.jl index 2a1886b3..38044564 100644 --- a/test/DivConformingTests.jl +++ b/test/DivConformingTests.jl @@ -63,7 +63,7 @@ function f(model,reffe) V = FESpace(model,reffe,conformity=:Hdiv) U = TrialFESpace(V) - das = FullyAssembledRows() + das = LocallyAssembled() trian = Triangulation(das,model) degree = 2 dΩ = Measure(trian,degree) diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl index bf1e78e6..22e35453 100644 --- a/test/FESpacesTests.jl +++ b/test/FESpacesTests.jl @@ -71,8 +71,8 @@ function assemble_tests(das,dΩ,dΩass,U,V) end function main(distribute,parts) - main(distribute,parts,SubAssembledRows()) - main(distribute,parts,FullyAssembledRows()) + main(distribute,parts,Assembled()) + main(distribute,parts,LocallyAssembled()) end function main(distribute,parts,das) diff --git a/test/HeatEquationTests.jl b/test/HeatEquationTests.jl index fa96dc11..89077ecb 100644 --- a/test/HeatEquationTests.jl +++ b/test/HeatEquationTests.jl @@ -37,7 +37,7 @@ function main(distribute,parts) op = TransientFEOperator(res,jac,jac_t,U,V0) - assembler = SparseMatrixAssembler(U,V0,SubAssembledRows()) + assembler = SparseMatrixAssembler(U,V0,Assembled()) op_constant = TransientLinearFEOperator( (a,m),b,U,V0,constant_forms=(true,true),assembler=assembler ) diff --git a/test/PLaplacianTests.jl b/test/PLaplacianTests.jl index 2347dfc4..c64ed23f 100644 --- a/test/PLaplacianTests.jl +++ b/test/PLaplacianTests.jl @@ -8,8 +8,8 @@ using Test using SparseArrays function main(distribute,parts) - main(distribute,parts,FullyAssembledRows(),SparseMatrixCSR{0,Float64,Int}) - main(distribute,parts,SubAssembledRows(),SparseMatrixCSC{Float64,Int}) + main(distribute,parts,LocallyAssembled(),SparseMatrixCSR{0,Float64,Int}) + main(distribute,parts,Assembled(),SparseMatrixCSC{Float64,Int}) end function main(distribute,parts,strategy,local_matrix_type) diff --git a/test/parrays_update.jl b/test/parrays_update.jl index 951a68d7..ae00925e 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -21,6 +21,8 @@ dist_V = TestFESpace(dist_model,reffe) serial_ids = get_free_dof_ids(serial_V) dist_ids = get_free_dof_ids(dist_V) +dof_ids = get_free_dof_ids(dist_V) + serial_Ω = Triangulation(serial_model) serial_dΩ = Measure(serial_Ω,2) @@ -31,6 +33,55 @@ serial_a1(u,v) = ∫(u⋅v)*serial_dΩ serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V) dist_a1(u,v) = ∫(u⋅v)*dist_dΩ +assem = SparseMatrixAssembler(dist_V,dist_V) dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) all(centralize(dist_A1) - serial_A1 .< 1e-10) +function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) + K = zeros(Int32,length(I)) + for (p,(i,j)) in enumerate(zip(I,J)) + if !skip && (i < 1 || j < 1) + continue + end + K[p] = nzindex(A,i,j) + end + K +end + + + +############################################################################################ + +ids = cids +indices = partition(ids) + +n_own = own_length(cids) +ghost2global = ghost_to_global(cids) +ghost2owner = ghost_to_owner(cids) + +first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +n_global = reduce(+,n_own) +n_parts = length(ranks) +indices2 = map(ranks,n_own,first_gid,ghost2global,ghost2owner) do 
rank,n_own,start,ghost2global,ghost2owner + p = CartesianIndex((rank,)) + np = (n_parts,) + n = (n_global,) + ranges = ((1:n_own).+(start-1),) + ghost = GhostIndices(n_global,ghost2global,ghost2owner) + + PartitionedArrays.LocalIndicesWithVariableBlockSize(p,np,n,ranges,ghost) +end + +map(indices,indices2) do indices, indices2 + ghost2local = ghost_to_local(indices) + own2local = own_to_local(indices) + + n_own = own_length(indices) + n_ghost = ghost_length(indices) + + #perm = vcat(own2local,ghost2local) + perm = fill(0,local_length(indices)) + perm[own2local] .= 1:n_own + perm[ghost2local] .= (n_own+1):(n_ghost+n_own) + permute_indices(indices2,perm) +end From da06a4b4a276897863a82275b7c431a7f2beeaef Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Sun, 8 Sep 2024 00:31:31 +1000 Subject: [PATCH 05/14] Saving progress --- src/Assembly.jl | 137 ++++++++++++++++++++++++++++++++++++++++--- src/PArraysExtras.jl | 55 ++++++++++++++++- 2 files changed, 182 insertions(+), 10 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 25e2411f..10e6d976 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -84,6 +84,20 @@ local_views(a::DistributedAllocation) = a.allocs const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} +function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) + map(local_views(a),partition(axes(a,1))) do a, ids + rows = remove_ghost(unpermute(ids)) + + ghost_lids = ghost_to_local(ids) + ghost_owners = ghost_to_owner(ids) + touched_ghost_lids = filter(lid -> a.touched[lid],ghost_lids) + touched_ghost_owners = collect(ghost_owners[touched_ghost_lids]) + touched_ghost_gids = to_global!(touched_ghost_lids,ids) + ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) + return replace_ghost(rows,ghost) + end +end + # PSparseMatrix assembly chain # # 1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter @@ -104,6 +118,21 @@ function Algebra.nz_allocation(a::PSparseMatrixCounter) DistributedAllocation(allocs,a.axes,a.strategy) end +function Algebra.create_from_nz(a::PSparseMatrix) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) + A, = create_from_nz_locally_assembled(a) + return A +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) + A, = create_from_nz_assembled(a,nothing) + return A +end + # PVector assembly chain: # # 1 - nz_counter(PVectorBuilder) -> PVectorCounter @@ -142,13 +171,16 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto end function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}) - rows = _setup_prange_without_ghosts(axes(a,1)) - _rhs_callback(a,rows) + rows = remove_ghost(unpermute(axes(a,1))) + values = local_views(a) + b = rhs_callback(values,rows) + return b end function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S - rows = _setup_prange_from_pvector_allocation(a) - b = _rhs_callback(a,rows) + rows = collect_touched_ids(a) + values = map(ai -> ai.values, local_views(a)) + b = rhs_callback(values,rows) t2 = assemble!(b) if t2 !== nothing wait(t2) @@ -162,7 +194,7 @@ end # overlap communications the assembly of the matrix and the vector. # Not only it is faster, but also necessary to ensure identical ghost indices # in both the matrix and vector rows. 
-# This is done by using the following specializations chain: +# This is done by using the following specializations: function Arrays.nz_allocation( a::PSparseMatrixCounter{<:Assembled}, @@ -178,9 +210,12 @@ function Algebra.create_from_nz( b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector} ) function callback(rows) - _rhs_callback(b,rows) + new_indices = partition(rows) + values = local_views(b) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,new_indices) end - A, B = _fa_create_from_nz_with_callback(callback,a) + A, B = create_from_nz_locally_assembled(a,callback) return A, B end @@ -189,11 +224,95 @@ function Algebra.create_from_nz( b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} ) function callback(rows) - _rhs_callback(b,rows) + new_indices = collect_touched_ids(b) + values = map(ai -> ai.values, local_views(b)) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,new_indices) end function async_callback(b) assemble!(b) end - A, B = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + A, B = create_from_nz_assembled(a,callback,async_callback) return A, B end + +# Assembly methods + +function create_from_nz_locally_assembled( + a, + callback::Function = rows -> nothing +) + # Recover some data + I,J,V = get_allocations(a) + test_gids, trial_gids = axes(a) + + rows = remove_ghost(unpermute(test_gids)) + b = callback(rows) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the range for cols + cols = _setup_prange(trial_gids,J;ax=:cols) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(I,J,V,rows,cols) + return A, b +end + +function create_from_nz_assembled( + a, + callback::Function = rows -> nothing, + async_callback::Function = b -> nothing +) + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the Prange for the rows + rows = _setup_prange(test_gids,I;ax=:rows) + + # Move (I,J,V) triplets to the owner process of each row I. 
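+  # (that is, each triplet (I[k],J[k],V[k]) is sent to the part that owns the
+  # global row index I[k])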
+  # The triplets are accompanied by Jo, which holds the owner process of each column index in J
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  t = _assemble_coo!(I,J,V,rows;owners=Jo)
+
+  # Here we can overlap computations
+  # This is a good place to overlap since
+  # sending the matrix rows is a lot of data
+  if !isa(b,Nothing)
+    bprange=_setup_prange_from_pvector_allocation(b)
+    b = callback(bprange)
+  end
+
+  # Wait the transfer to finish
+  wait(t)
+
+  # Create the Prange for the cols
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+
+  # Convert again I,J to local numeration
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(a,I,J,V,rows,cols)
+
+  # Wait the transfer to finish
+  if !isa(t2,Nothing)
+    wait(t2)
+  end
+
+  return A, b
+end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index f4a5ae2e..1a1d84de 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -45,10 +45,63 @@ function permuted_variable_partition(
   end
 end
 
+"""
+    unpermute(indices::AbstractLocalIndices)
+
+Given local indices, reorders them to (locally) have own indices first,
+followed by ghost indices.
+"""
+function unpermute(indices::AbstractLocalIndices)
+  @notimplemented
+end
+
+unpermute(indices::PermutedLocalIndices) = unpermute(indices.indices)
+unpermute(indices::PartitionedArrays.LocalIndicesInBlockPartition) = indices
+unpermute(indices::OwnAndGhostIndices) = indices
+
+function unpermute(indices::LocalIndices)
+  nglobal = global_length(indices)
+  rank = part_id(indices)
+  own = OwnIndices(nglobal,rank,own_to_global(indices))
+  ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices))
+  OwnAndGhostIndices(own,ghost,global_to_owner(indices))
+end
+
+"""
+    locally_repartition(v::PVector,new_indices)
+
+Map the values of a PVector to a new partitioning of the indices.
+
+Similar to `PartitionedArrays.repartition`, but without any communications. Instead,
+it is assumed that the local-to-local mapping can be done locally.
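+
+A sketch of the intended use (illustrative; assumes `v::PVector` and derives a
+ghost-free index partition from its own axes):
+
+    new_indices = map(remove_ghost,partition(axes(v,1)))
+    w = locally_repartition(v,new_indices)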
+""" +function locally_repartition(v::PVector,new_indices) + w = similar(v,PRange(new_indices)) + locally_repartition!(w,v) +end + +function locally_repartition!(w::PVector,v::PVector) + # Fill own values + map(copy!,own_values(w),own_values(v)) + + # Fill ghost values + new_indices = partition(axes(w,1)) + old_indices = partition(axes(v,1)) + map(partition(w),partition(v),new_indices,old_indices) do w,v,new_ids,old_ids + old_gid_to_lid = global_to_local(old_ids) + for (lid,gid) in zip(ghost_to_local(new_ids),ghost_to_global(new_ids)) + old_lid = old_gid_to_lid[gid] + w[lid] = v[old_lid] + end + end + + return w +end + # Linear algebra function LinearAlgebra.axpy!(α,x::PVector,y::PVector) - @check partition(axes(x,1)) === partition(axes(y,1)) + @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1))) map(partition(x),partition(y)) do x,y LinearAlgebra.axpy!(α,x,y) end From fe547432c393ef7bbff1da8359efd72759e41376 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 17 Sep 2024 13:50:20 +1000 Subject: [PATCH 06/14] Minor --- src/Assembly.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 10e6d976..f440fd49 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -236,7 +236,7 @@ function Algebra.create_from_nz( return A, B end -# Assembly methods +# Low-level assembly methods function create_from_nz_locally_assembled( a, From 8bacdf25f109c408861c92060774dc594ef80033 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 22 Oct 2024 01:02:47 +1100 Subject: [PATCH 07/14] More fixes --- src/Assembly.jl | 43 ++++++++++++++++++++++++---------------- src/FESpaces.jl | 12 ++++------- src/GridapDistributed.jl | 4 ++-- src/PArraysExtras.jl | 30 ++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 27 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index f440fd49..1797a996 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -42,7 +42,7 @@ struct DistributedCounter{S,T,N,A,B} <: GridapType function DistributedCounter( counters :: AbstractArray{T}, axes :: NTuple{N,<:PRange}, - strategy :: Algebra.AssemblyStrategy + strategy :: AssemblyStrategy ) where {T,N} A, B, S = typeof(counters), typeof(axes), typeof(strategy) new{S,T,N,A,B}(counters,axes,strategy) @@ -52,10 +52,10 @@ end Base.axes(a::DistributedCounter) = a.axes local_views(a::DistributedCounter) = a.counters -const PVectorCounter{S,T} = DistributedCounter{S,T<:Algebra.ArrayCounter,1} +const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() -const PSparseMatrixCounter{S,T} = DistributedCounter{S,T<:Algebra.CounterCOO,2} +const PSparseMatrixCounter{S,T,A,B} = DistributedCounter{S,T,2,A,B} Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop() """ @@ -71,7 +71,7 @@ struct DistributedAllocation{S,T,N,A,B} <: GridapType function DistributedAllocation( allocs :: AbstractArray{T}, axes :: NTuple{N,<:PRange}, - strategy :: Algebra.AssemblyStrategy + strategy :: AssemblyStrategy ) where {T,N} A, B, S = typeof(allocs), typeof(axes), typeof(strategy) new{S,T,N,A,B}(allocs,axes,strategy) @@ -84,6 +84,13 @@ local_views(a::DistributedAllocation) = a.allocs const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} +function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) where S + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + function 
collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) map(local_views(a),partition(axes(a,1))) do a, ids rows = remove_ghost(unpermute(ids)) @@ -104,13 +111,13 @@ end # 2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation # 3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix -function Algebra.nz_counter(builder::PSparseMatrixBuilder,axs::NTuple{2,<:PRange}) +function Algebra.nz_counter(builder::PSparseMatrixBuilder{MT},axs::NTuple{2,<:PRange}) where MT rows, cols = axs # test ids, trial ids - counters = map(partition(rows),partition(cols)) do rows,cols - axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) - Algebra.CounterCOO{A}(axs) + counters = map(partition(rows),partition(cols)) do rows, cols + local_axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) + Algebra.CounterCOO{MT}(local_axs) end - DistributedCounter(builder.par_strategy,counters,rows,cols) + DistributedCounter(counters,axs,builder.strategy) end function Algebra.nz_allocation(a::PSparseMatrixCounter) @@ -129,7 +136,9 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) end function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) - A, = create_from_nz_assembled(a,nothing) + callback(rows) = nothing + async_callback(b) = nothing + A, = create_from_nz_assembled(a,callback,async_callback) return A end @@ -180,8 +189,8 @@ end function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S rows = collect_touched_ids(a) values = map(ai -> ai.values, local_views(a)) - b = rhs_callback(values,rows) - t2 = assemble!(b) + b = rhs_callback(values,rows) + t2 = assemble!(b) if t2 !== nothing wait(t2) end @@ -271,19 +280,19 @@ function create_from_nz_assembled( ) # Recover some data I,J,V = get_allocations(a) - test_gids = get_test_gids(a) - trial_gids = get_trial_gids(a) + test_gids, trial_gids = axes(a) # convert I and J to global dof ids to_global_indices!(I,test_gids;ax=:rows) to_global_indices!(J,trial_gids;ax=:cols) # Create the Prange for the rows - rows = _setup_prange(test_gids,I;ax=:rows) + rows = unpermute(test_gids) + # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I)) # Move (I,J,V) triplets to the owner process of each row I. 
-  # The triplets are accompanied by Jo, which holds the owner process of each column index in J
-  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  J_owners = find_owner(trial_gids,J)
+  cols = union_ghost(unpermute(trial_gids),J,J_owners)
   t = _assemble_coo!(I,J,V,rows;owners=Jo)
 
   # Here we can overlap computations
   # This is a good place to overlap since
   # sending the matrix rows is a lot of data
   if !isa(b,Nothing)
     bprange=_setup_prange_from_pvector_allocation(b)
     b = callback(bprange)
   end
 
   # Wait the transfer to finish
   wait(t)
 
   # Create the Prange for the cols
-  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
 
   # Overlap rhs communications with CSC compression
   t2 = async_callback(b)
 
   # Convert again I,J to local numeration
   to_local_indices!(I,rows;ax=:rows)
   to_local_indices!(J,cols;ax=:cols)
 
   A = _setup_matrix(a,I,J,V,rows,cols)
 
   # Wait the transfer to finish
   if !isa(t2,Nothing)
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 02530cfa..e7aabe3d 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -603,15 +603,11 @@ FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_buil
 FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
 
 function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+  map(symbolic_loop_matrix!,local_views(A),local_views(a),matdata)
 end
 
 function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+  map(numeric_loop_matrix!,local_views(A),local_views(a),matdata)
 end
 
 function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
@@ -656,7 +652,7 @@ function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrix
   create_from_nz!(mat,allocs,cache)
 end
 
-function create_from_nz!(A,a::DistributedAllocationCOO{<:Assembled},cache)
+function create_from_nz!(A,a::DistributedAllocation{<:Assembled},cache)
   _,_,V = get_allocations(a)
   psparse!(A,V,cache) |> wait
   return A
@@ -699,7 +695,7 @@ function FESpaces.SparseMatrixAssembler(
       local_strategy
     )
   end
-  mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
+  mat_builder = PSparseMatrixBuilder(local_mat_type,par_strategy)
   vec_builder = PVectorBuilder(local_vec_type,par_strategy)
   if reuse_caches
     caches = Dict{UInt64,Any}()
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index b55ddfe8..4e31dcd3 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -42,11 +42,11 @@ export with_ghost, no_ghost
 
 export redistribute
 
+include("BlockPartitionedArrays.jl")
+
 include("GridapExtras.jl")
 include("PArraysExtras.jl")
 
-include("BlockPartitionedArrays.jl")
-
 include("Algebra.jl")
 
 include("Assembly.jl")
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 1a1d84de..19c97149 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -161,6 +161,36 @@ function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
   end
 end
 
+# To local/to global for blocks
+
+function to_local_indices!(I,ids::PRange;kwargs...)
+  map(to_local!,I,partition(ids))
+end
+
+function to_global_indices!(I,ids::PRange;kwargs...)
+  map(to_global!,I,partition(ids))
+end
+
+for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
+  @eval begin
+    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
+      map($f,I,ids)
+    end
+
+    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
+      @check ax ∈ [:rows,:cols]
+      block_ids = CartesianIndices(I)
+      map(block_ids) do id
+        i = id[1]; j = id[2];
+        if ax == :rows
+          $f(I[i,j],ids[i])
+        else
+          $f(I[i,j],ids[j])
+        end
+      end
+    end
+  end
+end
+
 # This type is required because MPIArray from PArrays
 # cannot be instantiated with a NULL communicator
 struct MPIVoidVector{T} <: AbstractVector{T}
From a21abd7a6438dc60fc8eed3bd4ed853636a851e5 Mon Sep 17 00:00:00 2001
From: JordiManyer
Date: Tue, 5 Nov 2024 19:02:35 +1100
Subject: [PATCH 08/14] We can assemble again!
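
A minimal smoke test of the restored chain, as a sketch (following the
debug-mode setup of test/parrays_update.jl, where `dist_V` is a distributed
FE space and `dist_dΩ` a measure on its triangulation):

    dist_a(u,v) = ∫(u⋅v)*dist_dΩ
    assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
    A = assemble_matrix(dist_a,assem,dist_V,dist_V) # goes through create_from_nz_assembled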
--- src/Assembly.jl | 52 +++++++++++++++++++++++------------------- src/GridapExtras.jl | 13 +++++++++++ src/PArraysExtras.jl | 13 +++++++++++ test/parrays_update.jl | 14 ++++++++++++ 4 files changed, 69 insertions(+), 23 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 1797a996..e0b33745 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -50,6 +50,7 @@ struct DistributedCounter{S,T,N,A,B} <: GridapType end Base.axes(a::DistributedCounter) = a.axes +Base.axes(a::DistributedCounter,d::Integer) = a.axes[d] local_views(a::DistributedCounter) = a.counters const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} @@ -79,8 +80,18 @@ struct DistributedAllocation{S,T,N,A,B} <: GridapType end Base.axes(a::DistributedAllocation) = a.axes +Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d] local_views(a::DistributedAllocation) = a.allocs +function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N} + indices = map(partition,axes) + local_axes = map(indices...) do indices... + map(ids -> Base.OneTo(local_length(ids)), indices) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocation(allocs,axes,a.strategy) +end + const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} @@ -97,9 +108,9 @@ function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) ghost_lids = ghost_to_local(ids) ghost_owners = ghost_to_owner(ids) - touched_ghost_lids = filter(lid -> a.touched[lid],ghost_lids) + touched_ghost_lids = filter(lid -> a.touched[lid], ghost_lids) touched_ghost_owners = collect(ghost_owners[touched_ghost_lids]) - touched_ghost_gids = to_global!(touched_ghost_lids,ids) + touched_ghost_gids = to_global!(touched_ghost_lids, ids) ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) return replace_ghost(rows,ghost) end @@ -280,43 +291,38 @@ function create_from_nz_assembled( ) # Recover some data I,J,V = get_allocations(a) - test_gids, trial_gids = axes(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) - # Create the Prange for the rows - rows = unpermute(test_gids) + # Move (I,J,V) triplets to the owner process of each row I. + rows = map(unpermute,test_ids) # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I)) + t = PartitionedArrays.assemble_coo!(I,J,V,rows) - # Move (I,J,V) triplets to the owner process of each row I. 
- J_owners = find_owner(trial_gids,J) - cols = union_ghost(unpermute(trial_gids),J,J_owners) - t = _assemble_coo!(I,J,V,rows;owners=Jo) - - # Here we can overlap computations - # This is a good place to overlap since - # sending the matrix rows is a lot of data - if !isa(b,Nothing) - bprange=_setup_prange_from_pvector_allocation(b) - b = callback(bprange) - end + # Overlap CSC communication with rhs assembly + b = callback(rows) # Wait the transfer to finish wait(t) # Create the Prange for the cols - cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) + J_owners = find_owner(trial_ids,J) + cols = map(union_ghost,map(unpermute,trial_ids),J,J_owners) # Overlap rhs communications with CSC compression t2 = async_callback(b) # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) - A = _setup_matrix(a,I,J,V,rows,cols) + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,true) # Wait the transfer to finish if !isa(t2,Nothing) diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl index df5b1186..e0723189 100644 --- a/src/GridapExtras.jl +++ b/src/GridapExtras.jl @@ -53,3 +53,16 @@ end end nothing end + +# change_axes + +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index 19c97149..be907629 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -98,6 +98,19 @@ function locally_repartition!(w::PVector,v::PVector) return w end +# SubSparseMatrix extensions + +function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) + I,J,V = findnz(A.parent) + rowmap, colmap = A.inv_indices + for k in eachindex(I) + I[k] = rowmap[I[k]] + J[k] = colmap[J[k]] + end + mask = map((i,j) -> (i > 0 && j > 0), I, J) + return I[mask], J[mask], V[mask] +end + # Linear algebra function LinearAlgebra.axpy!(α,x::PVector,y::PVector) diff --git a/test/parrays_update.jl b/test/parrays_update.jl index ae00925e..9e396812 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -37,6 +37,8 @@ assem = SparseMatrixAssembler(dist_V,dist_V) dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) all(centralize(dist_A1) - serial_A1 .< 1e-10) +centralize(dist_A1) + function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K = zeros(Int32,length(I)) for (p,(i,j)) in enumerate(zip(I,J)) @@ -48,7 +50,19 @@ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K end +Aoo = own_own_values(dist_A1).items[1] +using SparseArrays +function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) + I,J,V = findnz(A.parent) + rowmap, colmap = A.inv_indices + for k in eachindex(I) + I[k] = rowmap[I[k]] + J[k] = colmap[J[k]] + end + mask = map((i,j) -> (i > 0 && j > 0), I, J) + return I[mask], J[mask], V[mask] +end ############################################################################################ From 07cb9a4a307527682374cdbcb8080680b4f570e1 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 5 Nov 2024 19:45:40 +1100 Subject: [PATCH 09/14] Added the rest of assembly startegies --- src/Assembly.jl | 73 +++++++++++++++++++++++++------------------- src/PArraysExtras.jl | 4 +++ 2 files 
changed, 46 insertions(+), 31 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index e0b33745..a5f10c9c 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -262,34 +262,37 @@ function create_from_nz_locally_assembled( a, callback::Function = rows -> nothing ) - # Recover some data I,J,V = get_allocations(a) - test_gids, trial_gids = axes(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) - rows = remove_ghost(unpermute(test_gids)) + rows = map(remove_ghost,map(unpermute,test_ids)) b = callback(rows) - # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) - # Create the range for cols - cols = _setup_prange(trial_gids,J;ax=:cols) + # TODO: replace_ghost or union_ghost? + # Actually, do we want to change the ghosts at all? We could return the unpermuted trial_ids + J_owners = find_owner(trial_ids,J) + cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) + + assembled = true + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,assembled) - A = _setup_matrix(I,J,V,rows,cols) return A, b end function create_from_nz_assembled( - a, + a, callback::Function = rows -> nothing, - async_callback::Function = b -> nothing + async_callback::Function = b -> empty_async_task ) - # Recover some data I,J,V = get_allocations(a) test_ids = partition(axes(a,1)) trial_ids = partition(axes(a,2)) @@ -298,36 +301,44 @@ function create_from_nz_assembled( map(map_local_to_global!,I,test_ids) map(map_local_to_global!,J,trial_ids) - # Move (I,J,V) triplets to the owner process of each row I. + # Overlapped COO communication and vector assembly rows = map(unpermute,test_ids) - # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I)) t = PartitionedArrays.assemble_coo!(I,J,V,rows) - - # Overlap CSC communication with rhs assembly b = callback(rows) - - # Wait the transfer to finish wait(t) - # Create the Prange for the cols - J_owners = find_owner(trial_ids,J) - cols = map(union_ghost,map(unpermute,trial_ids),J,J_owners) - # Overlap rhs communications with CSC compression t2 = async_callback(b) + J_owners = find_owner(trial_ids,J) + cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) # TODO: replace_ghost or union_ghost? 
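+  # (to my understanding, replace_ghost swaps out the previous ghost set while
+  # union_ghost merges the new gids into it; see PartitionedArrays for the exact semantics)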
- # Convert again I,J to local numeration map(map_global_to_local!,I,rows) map(map_global_to_local!,J,cols) + assembled = true a_sys = change_axes(a,(PRange(rows),PRange(cols))) values = map(create_from_nz,local_views(a_sys)) - A = PSparseMatrix(values,rows,cols,true) + A = PSparseMatrix(values,rows,cols,assembled) - # Wait the transfer to finish - if !isa(t2,Nothing) - wait(t2) - end + wait(t2) + return A, b +end + +function create_from_nz_subassembled( + a, + callback::Function = rows -> nothing, + async_callback::Function = b -> empty_async_task +) + rows = partition(axes(a,1)) + cols = partition(axes(a,2)) + + b = callback(rows) + t2 = async_callback(b) + + assembled = false + values = map(create_from_nz,local_views(a)) + A = PSparseMatrix(values,rows,cols,assembled) + wait(t2) return A, b end diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index be907629..4d87e2b0 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -111,6 +111,10 @@ function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) return I[mask], J[mask], V[mask] end +# Async tasks + +const empty_async_task = PartitionedArrays.FakeTask(x -> nothing) + # Linear algebra function LinearAlgebra.axpy!(α,x::PVector,y::PVector) From a820d9a6979b68f5808e784e0816530c430b1c45 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 5 Nov 2024 23:31:50 +1100 Subject: [PATCH 10/14] Bugfixes --- src/Assembly.jl | 9 +++---- src/PArraysExtras.jl | 53 +++++++++++++++++++++++++++++++++++++++++- test/parrays_update.jl | 17 ++------------ 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index a5f10c9c..021e8151 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -147,9 +147,7 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) end function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) - callback(rows) = nothing - async_callback(b) = nothing - A, = create_from_nz_assembled(a,callback,async_callback) + A, = create_from_nz_assembled(a) return A end @@ -302,15 +300,14 @@ function create_from_nz_assembled( map(map_local_to_global!,J,trial_ids) # Overlapped COO communication and vector assembly - rows = map(unpermute,test_ids) + rows = filter_and_replace_ghost(map(unpermute,test_ids),I) t = PartitionedArrays.assemble_coo!(I,J,V,rows) b = callback(rows) wait(t) # Overlap rhs communications with CSC compression t2 = async_callback(b) - J_owners = find_owner(trial_ids,J) - cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) # TODO: replace_ghost or union_ghost? + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) map(map_global_to_local!,I,rows) map(map_global_to_local!,J,cols) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index 4d87e2b0..05c82163 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -98,6 +98,57 @@ function locally_repartition!(w::PVector,v::PVector) return w end +""" + filter_and_replace_ghost(indices,gids) + +Replace ghost ids in `indices` with the ghost ids within `gids`. +""" +function filter_and_replace_ghost(indices,gids) + owners = find_owner(indices,gids) + new_indices = map(indices,gids,owners) do indices, gids, owners + ghost_gids, ghost_owners = _filter_ghost(indices,gids,owners) + replace_ghost(indices, ghost_gids, ghost_owners) + end + return new_indices +end + +# Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that +# belong to `indices`. 
+function _filter_ghost(indices,gids,owners) + ghosts = Set{Int}() + part_owner = part_id(indices) + + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + push!(ghosts,gid) + end + end + + new_ghost_to_global = zeros(Int,n_ghost) + new_ghost_to_owner = zeros(Int32,n_ghost) + + empty!(ghosts) + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + new_ghost_to_global[n_ghost] = gid + new_ghost_to_owner[n_ghost] = owner + push!(ghosts,gid) + end + end + + return new_ghost_to_global, new_ghost_to_owner +end + # SubSparseMatrix extensions function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) @@ -113,7 +164,7 @@ end # Async tasks -const empty_async_task = PartitionedArrays.FakeTask(x -> nothing) +const empty_async_task = PartitionedArrays.FakeTask(() -> nothing) # Linear algebra diff --git a/test/parrays_update.jl b/test/parrays_update.jl index 9e396812..fd62332b 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -37,7 +37,8 @@ assem = SparseMatrixAssembler(dist_V,dist_V) dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) all(centralize(dist_A1) - serial_A1 .< 1e-10) -centralize(dist_A1) + +############################################################################################ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K = zeros(Int32,length(I)) @@ -50,20 +51,6 @@ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K end -Aoo = own_own_values(dist_A1).items[1] - -using SparseArrays -function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) - I,J,V = findnz(A.parent) - rowmap, colmap = A.inv_indices - for k in eachindex(I) - I[k] = rowmap[I[k]] - J[k] = colmap[J[k]] - end - mask = map((i,j) -> (i > 0 && j > 0), I, J) - return I[mask], J[mask], V[mask] -end - ############################################################################################ ids = cids From 7b3a94fb03b27034a803055721979860871ebc5d Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 5 Nov 2024 23:39:15 +1100 Subject: [PATCH 11/14] MInor --- src/Assembly.jl | 5 +---- src/PArraysExtras.jl | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 021e8151..a538eaa1 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -270,10 +270,7 @@ function create_from_nz_locally_assembled( map(map_local_to_global!,I,test_ids) map(map_local_to_global!,J,trial_ids) - # TODO: replace_ghost or union_ghost? - # Actually, do we want to change the ghosts at all? We could return the unpermuted trial_ids - J_owners = find_owner(trial_ids,J) - cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) map(map_global_to_local!,I,rows) map(map_global_to_local!,J,cols) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index 05c82163..b139a547 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -102,6 +102,10 @@ end filter_and_replace_ghost(indices,gids) Replace ghost ids in `indices` with the ghost ids within `gids`. + +NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether +they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we +would need. TODO: Make me better. 
""" function filter_and_replace_ghost(indices,gids) owners = find_owner(indices,gids) @@ -113,7 +117,7 @@ function filter_and_replace_ghost(indices,gids) end # Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that -# belong to `indices`. +# belong to `indices`. This could eventually be a flag in the original function. function _filter_ghost(indices,gids,owners) ghosts = Set{Int}() part_owner = part_id(indices) From 9c5b9682e8ac0d006815b2b995f41c7cfca9f4b1 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Wed, 26 Mar 2025 12:48:56 +1100 Subject: [PATCH 12/14] Saving changes --- src/Algebra.jl | 78 +++++++++++++++++- src/Assembly.jl | 174 +++++++++++++++++++++++++++++++++++------ src/FESpaces.jl | 148 ----------------------------------- src/PArraysExtras.jl | 20 ++++- test/parrays_update.jl | 46 +++++++++-- 5 files changed, 286 insertions(+), 180 deletions(-) diff --git a/src/Algebra.jl b/src/Algebra.jl index 3fa0b502..64410c95 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -68,9 +68,11 @@ end # change_ghost function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T - same_partition = (a.index_partition === partition(ids)) - a_new = same_partition ? a : change_ghost(T,a,ids) - if make_consistent && (!same_partition || !is_consistent) + if (a.index_partition === partition(ids)) + return a + end + a_new = change_ghost(T,a,ids) + if make_consistent && !is_consistent consistent!(a_new) |> wait end return a_new @@ -98,3 +100,73 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_ end return BlockPVector(vals,ids) end + +# This type is required in order to be able to access the local portion +# of distributed sparse matrices and vectors using local indices from the +# distributed test and trial spaces +struct LocalView{T,N,A} <:AbstractArray{T,N} + plids_to_value::A + d_to_lid_to_plid::NTuple{N,Vector{Int32}} + local_size::NTuple{N,Int} + function LocalView( + plids_to_value::AbstractArray{T,N}, d_to_lid_to_plid::NTuple{N,Vector{Int32}} + ) where {T,N} + A = typeof(plids_to_value) + local_size = map(length,d_to_lid_to_plid) + new{T,N,A}(plids_to_value,d_to_lid_to_plid,local_size) + end +end + +Base.size(a::LocalView) = a.local_size +Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() +function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} + plids = map(getindex,a.d_to_lid_to_plid,lids) + if all(i->i>0,plids) + a.plids_to_value[plids...] + else + zero(T) + end +end +function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} + plids = map(getindex,a.d_to_lid_to_plid,lids) + @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" + a.plids_to_value[plids...] 
= v +end + +function local_views(a::PVector,new_rows::PRange) + old_rows = axes(a,1) + if partition(old_rows) === partition(new_rows) + partition(a) + else + map(partition(a),partition(old_rows),partition(new_rows)) do a,old_rows,new_rows + LocalView(a,(local_to_local_map(new_rows,old_rows),)) + end + end +end + +function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) + old_rows, old_cols = axes(a) + if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) + partition(a) + else + map( + partition(a),partition(old_rows),partition(old_cols),partition(new_rows),partition(new_cols) + ) do a,old_rows,old_cols,new_rows,new_cols + rl2lmap = local_to_local_map(new_rows,old_rows) + cl2lmap = local_to_local_map(new_cols,old_cols) + LocalView(a,(rl2lmap,cl2lmap)) + end + end +end + +function local_views(a::BlockPVector,new_rows::BlockPRange) + vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays + return map(mortar,vals) +end + +function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) + vals = map(CartesianIndices(blocksize(a))) do I + local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) + end |> to_parray_of_arrays + return map(mortar,vals) +end diff --git a/src/Assembly.jl b/src/Assembly.jl index a538eaa1..165d41c4 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -5,23 +5,36 @@ struct Assembled <: FESpaces.AssemblyStrategy end struct SubAssembled <: FESpaces.AssemblyStrategy end struct LocallyAssembled <: FESpaces.AssemblyStrategy end +function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols) + DefaultAssemblyStrategy() +end + +# When LocallyAssembling, make sure that you also loop over ghost cells. +function local_assembly_strategy(::LocallyAssembled,rows,cols) + rows_local_to_ghost = local_to_ghost(rows) + GenericAssemblyStrategy( + identity, + identity, + row->iszero(rows_local_to_ghost[row]), + col->true + ) +end + # PSparseMatrix and PVector builders -struct PSparseMatrixBuilder{T,B} - local_matrix_type::Type{T} +struct DistributedArrayBuilder{T,B} + local_array_type::Type{T} strategy::B end +const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T<:AbstractMatrix,B} +const PVectorBuilder{T,B} = DistributedArrayBuilder{T<:AbstractVector,B} + function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv T = eltype(Tv) return PSparseMatrix{T} end -struct PVectorBuilder{T,B} - local_vector_type::Type{T} - strategy::B -end - function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv T = eltype(Tv) return PVector{T} @@ -53,6 +66,11 @@ Base.axes(a::DistributedCounter) = a.axes Base.axes(a::DistributedCounter,d::Integer) = a.axes[d] local_views(a::DistributedCounter) = a.counters +function local_views(a::DistributedCounter,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.counters +end + const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() @@ -83,6 +101,11 @@ Base.axes(a::DistributedAllocation) = a.axes Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d] local_views(a::DistributedAllocation) = a.allocs +function local_views(a::DistributedAllocation,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.allocs +end + function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N} indices = map(partition,axes) local_axes = map(indices...) do indices... 
@@ -151,6 +174,11 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) return A end +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:SubAssembled}) + A, = create_from_nz_subassembled(a) + return A +end + # PVector assembly chain: # # 1 - nz_counter(PVectorBuilder) -> PVectorCounter @@ -166,7 +194,7 @@ function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) wh DistributedCounter(counters,axs,builder.strategy) end -function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled}) +function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}}) allocs = map(nz_allocation,local_views(a)) DistributedAllocation(allocs,a.axes,a.strategy) end @@ -188,14 +216,14 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto @assert false end -function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}) +function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}}) rows = remove_ghost(unpermute(axes(a,1))) values = local_views(a) b = rhs_callback(values,rows) return b end -function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled}) rows = collect_touched_ids(a) values = map(ai -> ai.values, local_views(a)) b = rhs_callback(values,rows) @@ -215,23 +243,19 @@ end # This is done by using the following specializations: function Arrays.nz_allocation( - a::PSparseMatrixCounter{<:Assembled}, - b::PVectorCounter{<:Assembled} + a::PSparseMatrixCounter, b::PVectorCounter ) - A = nz_allocation(a) # PSparseMatrixAllocation - B = nz_allocation(b) # PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} - return A, B + return nz_allocation(a), nz_allocation(b) end function Algebra.create_from_nz( a::PSparseMatrixAllocation{<:LocallyAssembled}, - b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector} + b::PVectorAllocation{<:LocallyAssembled} ) function callback(rows) - new_indices = partition(rows) values = local_views(b) b_fespace = PVector(values,partition(axes(b,1))) - locally_repartition(b_fespace,new_indices) + locally_repartition(b_fespace,rows) end A, B = create_from_nz_locally_assembled(a,callback) return A, B @@ -239,7 +263,7 @@ end function Algebra.create_from_nz( a::PSparseMatrixAllocation{<:Assembled}, - b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} + b::PVectorAllocation{<:Assembled} ) function callback(rows) new_indices = collect_touched_ids(b) @@ -254,6 +278,19 @@ function Algebra.create_from_nz( return A, B end +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:SubAssembled}, + b::PVectorAllocation{<:SubAssembled} +) + function callback(rows) + @check matching_local_indices(rows,axes(b,1)) + values = local_views(b) + PVector(values,partition(axes(b,1))) + end + A, B = create_from_nz_subassembled(a,callback) + return A, B +end + # Low-level assembly methods function create_from_nz_locally_assembled( @@ -321,18 +358,111 @@ end function create_from_nz_subassembled( a, callback::Function = rows -> nothing, - async_callback::Function = b -> empty_async_task ) rows = partition(axes(a,1)) cols = partition(axes(a,2)) b = callback(rows) - t2 = async_callback(b) assembled = false values = map(create_from_nz,local_views(a)) A = PSparseMatrix(values,rows,cols,assembled) - wait(t2) return A, b end + +# DistributedSparseMatrixAssemblers + +""" +""" +struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: 
SparseMatrixAssembler + strategy::A + assems::B + matrix_builder::C + vector_builder::D + test_gids::E + trial_gids::F +end + +local_views(a::DistributedSparseMatrixAssembler) = a.assems + +FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids +FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids +FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder +FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder +FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy + +function FESpaces.SparseMatrixAssembler( + local_mat_type, + local_vec_type, + rows::PRange, + cols::PRange, + par_strategy=Assembled() +) + assems = map(partition(rows),partition(cols)) do rows,cols + FESpaces.GenericSparseMatrixAssembler( + SparseMatrixBuilder(local_mat_type), + ArrayBuilder(local_vec_type), + Base.OneTo(length(rows)), + Base.OneTo(length(cols)), + local_assembly_strategy(par_strategy,rows,cols) + ) + end + mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy) + vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy) + return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols) +end + +function FESpaces.SparseMatrixAssembler( + local_mat_type, + local_vec_type, + trial::DistributedFESpace, + test::DistributedFESpace, + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... +) + rows = get_free_dof_ids(test) + cols = get_free_dof_ids(trial) + SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) +end + +function FESpaces.SparseMatrixAssembler( + trial::DistributedFESpace, + test::DistributedFESpace, + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... +) + Tv = getany(map(get_vector_type,local_views(trial))) + Tm = SparseMatrixCSC{eltype(Tv),Int} + SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
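+  # (so by default we assemble into CSC local matrices whose scalar type matches
+  # the trial space's vector type)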
+end + +function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 1fb12343..4724789b 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -582,154 +582,6 @@ function FESpaces.collect_cell_matrix_and_vector( end end -""" -""" -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F,G} <: SparseMatrixAssembler - strategy::A - assems::B - matrix_builder::C - vector_builder::D - test_gids::E - trial_gids::F - caches::G -end - -local_views(a::DistributedSparseMatrixAssembler) = a.assems - -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids -FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder -FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder -FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy - -function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - map(symbolic_loop_matrix!,local_views(A),local_views(a),matdata) -end - -function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - map(numeric_loop_matrix!,local_views(A),local_views(a),matdata) -end - -function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - Aviews=local_views(A,rows,cols) - bviews=local_views(b,rows) - aviews=local_views(a) - map(symbolic_loop_matrix_and_vector!,Aviews,bviews,aviews,data) -end - -function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end - -function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrixAssembler,matdata) - @check !isnothing(a.caches) "Assembler 
is not reusable!" - @check haskey(a.caches,objectid(mat)) "Cache not found!" - cache = a.caches[objectid(mat)] - - LinearAlgebra.fillstored!(mat,zero(eltype(mat))) - - counter = nz_counter(get_matrix_builder(a),(get_rows(a),get_cols(a))) - map(local_views(counter),partition(mat)) do counter, mat - counter.nnz = nnz(mat) - end - allocs = nz_allocation(counter) - numeric_loop_matrix!(allocs,a,matdata) - - create_from_nz!(mat,allocs,cache) -end - -function create_from_nz!(A,a::DistributedAllocation{<:Assembled},cache) - _,_,V = get_allocations(a) - psparse!(A,V,cache) |> wait - return A -end - -# Parallel Assembly strategies - -function local_assembly_strategy(::Assembled,rows,cols) - DefaultAssemblyStrategy() -end - -# When using this one, make sure that you also loop over ghost cells. -# This is at your own risk. -function local_assembly_strategy(::LocallyAssembled,rows,cols) - rows_local_to_ghost = local_to_ghost(rows) - GenericAssemblyStrategy( - identity, - identity, - row->iszero(rows_local_to_ghost[row]), - col->true - ) -end - -# Assembler high level constructors -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - rows::PRange, - cols::PRange, - par_strategy=Assembled(); - reuse_caches=false -) - assems = map(partition(rows),partition(cols)) do rows,cols - local_strategy = local_assembly_strategy(par_strategy,rows,cols) - FESpaces.GenericSparseMatrixAssembler( - SparseMatrixBuilder(local_mat_type), - ArrayBuilder(local_vec_type), - Base.OneTo(length(rows)), - Base.OneTo(length(cols)), - local_strategy - ) - end - mat_builder = PSparseMatrixBuilder(local_mat_type,par_strategy) - vec_builder = PVectorBuilder(local_vec_type,par_strategy) - if reuse_caches - caches = Dict{UInt64,Any}() - else - caches = nothing - end - return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols,caches) -end - -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - rows = get_free_dof_ids(test) - cols = get_free_dof_ids(trial) - SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) -end - -function FESpaces.SparseMatrixAssembler( - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial))) - T = eltype(Tv) - Tm = SparseMatrixCSC{T,Int} - SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) -end - # ZeroMean FESpace struct DistributedZeroMeanCache{A,B} dvol::A diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index b139a547..df09bdb0 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -153,6 +153,25 @@ function _filter_ghost(indices,gids,owners) return new_ghost_to_global, new_ghost_to_owner end +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. 
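+# For example (illustrative): if a's local gids are (3,7,9) and b's are (7,9,11),
+# local_to_local_map(a,b) returns Int32[0,1,2] -- gid 3 has no counterpart in b,
+# while gids 7 and 9 live at b's local positions 1 and 2.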
+# Note that the iszero check is necessary because, in the general case,
+# there might be gids in b which are not present in a.
+function local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
+  a_lid_to_b_lid = fill(zero(Int32),local_length(a))
+
+  b_local_to_global = local_to_global(b)
+  a_global_to_local = global_to_local(a)
+  for b_lid in 1:local_length(b)
+    gid = b_local_to_global[b_lid]
+    a_lid = a_global_to_local[gid]
+    if !iszero(a_lid)
+      a_lid_to_b_lid[a_lid] = b_lid
+    end
+  end
+  a_lid_to_b_lid
+end
+
 # SubSparseMatrix extensions
 
 function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
@@ -362,4 +381,3 @@ end
 function generate_subparts(parts::DebugArray,new_comm_size)
   DebugArray(LinearIndices((new_comm_size,)))
 end
-
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index fd62332b..8516150e 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -3,6 +3,8 @@ using Gridap
 using PartitionedArrays
 using GridapDistributed
 
+using Gridap.FESpaces, Gridap.Algebra
+
 np = (1,2)
 ranks = with_debug() do distribute
   distribute(LinearIndices((prod(np),)))
@@ -29,14 +31,46 @@ serial_dΩ = Measure(serial_Ω,2)
 dist_Ω = Triangulation(dist_model)
 dist_dΩ = Measure(dist_Ω,2)
 
-serial_a1(u,v) = ∫(u⋅v)*serial_dΩ
-serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V)
+dist_Ωg = Triangulation(GridapDistributed.with_ghost,dist_model)
+dist_dΩg = Measure(dist_Ωg,2)
+
+serial_a(u,v) = ∫(u⋅v)*serial_dΩ
+dist_a(u,v) = ∫(u⋅v)*dist_dΩ
+dist_ag(u,v) = ∫(u⋅v)*dist_dΩg
+
+serial_A = assemble_matrix(serial_a,serial_V,serial_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
+dist_A_AS = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.LocallyAssembled())
+dist_A_LA = assemble_matrix(dist_ag,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.SubAssembled())
+dist_A_SA = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+all(abs.(centralize(dist_A_AS) - serial_A) .< 1e-10)
+
+x_AS = prand(partition(axes(dist_A_AS,2)))
+x_LA = GridapDistributed.change_ghost(x_AS,axes(dist_A_LA,2))
+x_SA = GridapDistributed.change_ghost(x_AS,axes(dist_A_SA,2))
+
+norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+
+############################################################################################
+
+A = deepcopy(dist_A_AS)
+
+t = assemble!(A;reuse=true)
+_, cache = fetch(t)
 
-dist_a1(u,v) = ∫(u⋅v)*dist_dΩ
-assem = SparseMatrixAssembler(dist_V,dist_V)
-dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
+t2 = assemble!(A,cache)
+wait(t2)
 
-all(centralize(dist_A1) - serial_A1 .< 1e-10)
+map(==, partition(A), partition(dist_A_AS))
 
 ############################################################################################
 
From eef271e37b795fba23f11f4a9b64849f7c880319 Mon Sep 17 00:00:00 2001
From: JordiManyer
Date: Mon, 31 Mar 2025 07:53:26 +1100
Subject: [PATCH 13/14] saving changes

---
 src/Assembly.jl        | 114 +++----------------
 src/FESpaces.jl        | 248 ++++++++++++++++++++++++++++--------------
 test/parrays_update.jl |  59 +---------
 3 files changed, 185 insertions(+), 236 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 165d41c4..1e0127bd 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -22,16 +22,24 @@ end
 
 # PSparseMatrix and PVector builders
 
-struct DistributedArrayBuilder{T,B}
+struct DistributedArrayBuilder{T,N,B}
   local_array_type::Type{T}
   strategy::B
+  function 
DistributedArrayBuilder( + local_array_type::Type{T}, + strategy::AssemblyStrategy + ) where T + N = ndims(T) + B = typeof(strategy) + new{T,N,B}(local_array_type,strategy) + end end -const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T<:AbstractMatrix,B} -const PVectorBuilder{T,B} = DistributedArrayBuilder{T<:AbstractVector,B} +const PVectorBuilder{T,B} = DistributedArrayBuilder{T,1,B} +const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T,2,B} -function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv - T = eltype(Tv) +function Algebra.get_array_type(::PSparseMatrixBuilder{Tm}) where Tm + T = eltype(Tm) return PSparseMatrix{T} end @@ -370,99 +378,3 @@ function create_from_nz_subassembled( return A, b end - -# DistributedSparseMatrixAssemblers - -""" -""" -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler - strategy::A - assems::B - matrix_builder::C - vector_builder::D - test_gids::E - trial_gids::F -end - -local_views(a::DistributedSparseMatrixAssembler) = a.assems - -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids -FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder -FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder -FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy - -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - rows::PRange, - cols::PRange, - par_strategy=Assembled() -) - assems = map(partition(rows),partition(cols)) do rows,cols - FESpaces.GenericSparseMatrixAssembler( - SparseMatrixBuilder(local_mat_type), - ArrayBuilder(local_vec_type), - Base.OneTo(length(rows)), - Base.OneTo(length(cols)), - local_assembly_strategy(par_strategy,rows,cols) - ) - end - mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy) - vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy) - return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols) -end - -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - rows = get_free_dof_ids(test) - cols = get_free_dof_ids(trial) - SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) -end - -function FESpaces.SparseMatrixAssembler( - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - Tv = getany(map(get_vector_type,local_views(trial))) - Tm = SparseMatrixCSC{eltype(Tv),Int} - SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
-end - -function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows, cols = get_rows(a), get_cols(a) - map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows, cols = get_rows(a), get_cols(a) - map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows, cols = get_rows(a), get_cols(a) - map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end - -function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows, cols = get_rows(a), get_cols(a) - map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 4724789b..3121e118 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -505,83 +505,6 @@ function _add_distributed_constraint(F::DistributedFESpace,order::Integer,constr V end -# Assembly - -function FESpaces.collect_cell_matrix( - trial::DistributedFESpace, - test::DistributedFESpace, - a::DistributedDomainContribution -) - map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace, a::DistributedDomainContribution -) - map(collect_cell_vector,local_views(test),local_views(a)) -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution -) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform) - ) -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution, - uhd -) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform), - local_views(uhd) - ) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace,l::Number -) - map(local_views(test)) do s - collect_cell_vector(s,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number -) - map(local_views(trial),local_views(test),local_views(mat)) do u,v,m - collect_cell_matrix_and_vector(u,v,m,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number, - uhd -) - map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f - collect_cell_matrix_and_vector(u,v,m,l,f) - end -end - # ZeroMean FESpace struct DistributedZeroMeanCache{A,B} dvol::A @@ -741,3 +664,174 @@ function FESpaces.ConstantFESpace( vector_type = _find_vector_type(spaces,gids) return 
DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
 end
+
+# Assembly
+
+"""
+    DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+
+Assembler for distributed FE systems. It gathers one local assembler per part,
+the builders for the global `PSparseMatrix`/`PVector`, the test/trial dof index
+partitions, and the parallel assembly strategy.
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type),
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  rows = get_free_dof_ids(test)
+  cols = get_free_dof_ids(trial)
+  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  Tv = getany(map(get_vector_type,local_views(trial)))
+  Tm = SparseMatrixCSC{eltype(Tv),Int}
+  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
+end + +function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.collect_cell_matrix( + trial::DistributedFESpace, + test::DistributedFESpace, + a::DistributedDomainContribution +) + map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace, a::DistributedDomainContribution +) + map(collect_cell_vector,local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform) + ) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution, + uhd +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform), + local_views(uhd) + ) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace,l::Number +) + map(local_views(test)) do s + collect_cell_vector(s,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number +) + map(local_views(trial),local_views(test),local_views(mat)) do u,v,m + collect_cell_matrix_and_vector(u,v,m,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number, + uhd +) + map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f + collect_cell_matrix_and_vector(u,v,m,l,f) + end +end diff --git a/test/parrays_update.jl b/test/parrays_update.jl index 8516150e..961cbfcd 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -59,64 +59,7 @@ norm(dist_A_AS*x_AS - dist_A_LA*x_LA) norm(dist_A_AS*x_AS - dist_A_SA*x_SA) assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V) +norm(dist_A_AS*x_AS - dist_A_SA*x_SA) 
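+# (the norm above checks that the in-place reassembly reproduces the same operator)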
############################################################################################ -A = deepcopy(dist_A_AS) - -t = assemble!(A;reuse=true) -_, cache = fetch(t) - -t2 = assemble!(A,cache) -wait(t2) - -map(==, partition(A), partition(dist_A_AS)) - -############################################################################################ - -function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) - K = zeros(Int32,length(I)) - for (p,(i,j)) in enumerate(zip(I,J)) - if !skip && (i < 1 || j < 1) - continue - end - K[p] = nzindex(A,i,j) - end - K -end - -############################################################################################ - -ids = cids -indices = partition(ids) - -n_own = own_length(cids) -ghost2global = ghost_to_global(cids) -ghost2owner = ghost_to_owner(cids) - -first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) -n_global = reduce(+,n_own) -n_parts = length(ranks) -indices2 = map(ranks,n_own,first_gid,ghost2global,ghost2owner) do rank,n_own,start,ghost2global,ghost2owner - p = CartesianIndex((rank,)) - np = (n_parts,) - n = (n_global,) - ranges = ((1:n_own).+(start-1),) - ghost = GhostIndices(n_global,ghost2global,ghost2owner) - - PartitionedArrays.LocalIndicesWithVariableBlockSize(p,np,n,ranges,ghost) -end - -map(indices,indices2) do indices, indices2 - ghost2local = ghost_to_local(indices) - own2local = own_to_local(indices) - - n_own = own_length(indices) - n_ghost = ghost_length(indices) - - #perm = vcat(own2local,ghost2local) - perm = fill(0,local_length(indices)) - perm[own2local] .= 1:n_own - perm[ghost2local] .= (n_own+1):(n_ghost+n_own) - permute_indices(indices2,perm) -end From cb9de8ec348a09d14670b606e5dda3918177c71e Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 31 Mar 2025 10:13:45 +1100 Subject: [PATCH 14/14] Bugfixes --- src/Assembly.jl | 19 ++++++++++--------- src/FESpaces.jl | 3 +-- src/PArraysExtras.jl | 19 ++++++++++++++++++- test/PeriodicBCsTests.jl | 2 ++ test/parrays_update.jl | 1 - 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 1e0127bd..41673c5e 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -133,18 +133,19 @@ function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) return I,J,V end -function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) - map(local_views(a),partition(axes(a,1))) do a, ids +function collect_touched_ids(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S + touched_ids = map(local_views(a),partition(axes(a,1))) do a, ids + n_global = global_length(ids) rows = remove_ghost(unpermute(ids)) ghost_lids = ghost_to_local(ids) - ghost_owners = ghost_to_owner(ids) - touched_ghost_lids = filter(lid -> a.touched[lid], ghost_lids) - touched_ghost_owners = collect(ghost_owners[touched_ghost_lids]) + touched_ghost_lids = collect(Int,filter(lid -> a.touched[lid], ghost_lids)) + touched_ghost_owners = local_to_owner(ids)[touched_ghost_lids] touched_ghost_gids = to_global!(touched_ghost_lids, ids) ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) - return replace_ghost(rows,ghost) + replace_ghost(rows,ghost) end + return touched_ids end # PSparseMatrix assembly chain @@ -193,13 +194,13 @@ end # 2 - nz_allocation(PVectorCounter) -> PVectorAllocation # 3 - create_from_nz(PVectorAllocation) -> PVector -function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) where VT +function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::NTuple{1,<:PRange}) where 
VT
   rows, = axs
   counters = map(partition(rows)) do rows
     axs = (Base.OneTo(local_length(rows)),)
     nz_counter(ArrayBuilder(VT),axs)
   end
-  DistributedCounter(counters,axs,builder.strategy)
+  DistributedCounter(counters,(rows,),builder.strategy)
 end
 
 function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}})
@@ -291,7 +292,7 @@ function Algebra.create_from_nz(
   b::PVectorAllocation{<:SubAssembled}
 )
   function callback(rows)
-    @check matching_local_indices(rows,axes(b,1))
+    @check PArrays.matching_local_indices(PRange(rows),axes(b,1))
     values = local_views(b)
     PVector(values,partition(axes(b,1)))
   end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 3121e118..f68b7470 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -168,8 +168,7 @@ function generate_gids(
       cell_owner = cell_local_to_owner[cell]
       p = cell_to_gdofs.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
-        dof_owner = ldof_to_owner[ldof]
-        if ldof > 0 && (dof_owner == cell_owner)
+        if ldof > 0 && isequal(ldof_to_owner[ldof],cell_owner)
           ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
         end
       end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index df09bdb0..302944a6 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -3,7 +3,7 @@
     permuted_variable_partition(n_own,gids,owners; n_global, start)
 
 Create indices which are a permuted version of a variable_partition.
-The advantage of this wrt the `LocalIndices` type, is that we can compute
+The advantage of this w.r.t. the `LocalIndices` type is that we can compute
 dof owners with minimal communications, since we only need the size of the blocks.
 
 NOTE: This is the type for our FESpace dof_ids.
@@ -153,6 +153,20 @@ function _filter_ghost(indices,gids,owners)
   return new_ghost_to_global, new_ghost_to_owner
 end
 
+# function PArrays.remove_ghost(indices::PermutedLocalIndices)
+#   n_global = global_length(indices)
+#   own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
+#   ghost = GhostIndices(n_global,Int[],Int32[])
+#   OwnAndGhostIndices(own,ghost)
+# end
+
+# NOTE: We assume the permutation keeps the owned indices in their original
+# relative order, so it suffices to strip the ghosts from the underlying
+# (unpermuted) indices.
+function PArrays.remove_ghost(indices::PermutedLocalIndices)
+  remove_ghost(indices.indices)
+end
+
 # This function computes a mapping among the local identifiers of a and b
 # for which the corresponding global identifiers are both in a and b. 
diff --git a/test/PeriodicBCsTests.jl b/test/PeriodicBCsTests.jl
index 6177752d..247cb6fc 100644
--- a/test/PeriodicBCsTests.jl
+++ b/test/PeriodicBCsTests.jl
@@ -35,4 +35,6 @@ function main(distribute,parts)
 
 end
 
+main(DebugArray,(2,2))
+
 end # module
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index 961cbfcd..ec106730 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -62,4 +62,3 @@ assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
 norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
 
 ############################################################################################
-
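############################################################################################

# A condensed, self-contained sketch of the assembly API exercised in
# test/parrays_update.jl above (debug mode, 2 parts; the mesh, FE space and
# integrand are illustrative choices).

using Gridap
using PartitionedArrays
using GridapDistributed

np = (1,2)
ranks = with_debug() do distribute
  distribute(LinearIndices((prod(np),)))
end

model = CartesianDiscreteModel(ranks,np,(0,1,0,1),(8,8))
V = FESpace(model,ReferenceFE(lagrangian,Float64,1))
dΩ = Measure(Triangulation(model),2)
a(u,v) = ∫(u⋅v)*dΩ

# Build an assembler with an explicit parallel strategy, assemble once, then
# reassemble in place reusing the sparsity pattern of A.
assem = SparseMatrixAssembler(V,V,GridapDistributed.Assembled())
A = assemble_matrix(a,assem,V,V)
assemble_matrix!(a,A,assem,V,V)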