From fb0de46755e8776ac8dd965b61c6934844deca20 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Fri, 30 Aug 2024 12:26:12 +1000 Subject: [PATCH 01/14] Updated to PartitionedArrays v0.5 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 1dc2bd64..494e3732 100644 --- a/Project.toml +++ b/Project.toml @@ -20,7 +20,7 @@ FillArrays = "0.8.4,1" Gridap = "0.18.3" LinearAlgebra = "1.3" MPI = "0.16, 0.17, 0.18, 0.19, 0.20" -PartitionedArrays = "0.3.3" +PartitionedArrays = "0.5" SparseArrays = "1.3" SparseMatricesCSR = "0.6.6" WriteVTK = "1.12.0" From 9678cbd448f874513af8f99af1ee53089fbaa390 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Fri, 30 Aug 2024 12:26:34 +1000 Subject: [PATCH 02/14] Assembly working, no reuse --- src/Algebra.jl | 174 ++++++++++++++-------------- src/BlockPartitionedArrays.jl | 4 +- src/FESpaces.jl | 4 +- test/BlockPartitionedArraysTests.jl | 2 +- test/parrays_update.jl | 36 ++++++ 5 files changed, 128 insertions(+), 92 deletions(-) create mode 100644 test/parrays_update.jl diff --git a/src/Algebra.jl b/src/Algebra.jl index 57bcbac3..11af9149 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -62,8 +62,8 @@ function Algebra.axpy_entries!( # We should definitely check here that the index partitions are the same. # However: Because the different matrices are assembled separately, the objects are not the # same (i.e can't use ===). Checking the index partitions would then be costly... - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) map(partition(A),partition(B)) do A, B Algebra.axpy_entries!(α,A,B;check) end @@ -394,13 +394,14 @@ struct PSparseMatrixBuilderCOO{T,B} end function Algebra.nz_counter( - builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}) where A - test_dofs_gids_prange, trial_dofs_gids_prange = axs - counters = map(partition(test_dofs_gids_prange),partition(trial_dofs_gids_prange)) do r,c + builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange} +) where A + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do r,c axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) Algebra.CounterCOO{A}(axs) end - DistributedCounterCOO(builder.par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) + DistributedCounterCOO(builder.par_strategy,counters,rows,cols) end function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv @@ -414,56 +415,58 @@ end struct DistributedCounterCOO{A,B,C,D} <: GridapType par_strategy::A counters::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D + test_gids::C + trial_gids::D function DistributedCounterCOO( par_strategy, counters::AbstractArray{<:Algebra.CounterCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) + test_gids::PRange, + trial_gids::PRange) A = typeof(par_strategy) B = typeof(counters) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) end end function 
local_views(a::DistributedCounterCOO) - a.counters + return a.counters end -function local_views(a::DistributedCounterCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange - a.counters +function local_views(a::DistributedCounterCOO,test_gids,trial_gids) + @check PArrays.matching_local_indices(test_gids,a.test_gids) + @check PArrays.matching_local_indices(trial_gids,a.trial_gids) + return a.counters end function Algebra.nz_allocation(a::DistributedCounterCOO) allocs = map(nz_allocation,a.counters) - DistributedAllocationCOO(a.par_strategy,allocs,a.test_dofs_gids_prange,a.trial_dofs_gids_prange) + DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) end struct DistributedAllocationCOO{A,B,C,D} <: GridapType par_strategy::A allocs::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D + test_gids::C + trial_gids::D function DistributedAllocationCOO( par_strategy, allocs::AbstractArray{<:Algebra.AllocationCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) + test_gids::PRange, + trial_gids::PRange) A = typeof(par_strategy) B = typeof(allocs) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,allocs,test_dofs_gids_prange,trial_dofs_gids_prange) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) end end -function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, - axes::Tuple{<:PRange,<:PRange}) where {A,B} +function change_axes( + a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, + axes::Tuple{<:PRange,<:PRange} +) where {A,B} local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) end @@ -471,8 +474,10 @@ function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) end -function change_axes(a::MatrixBlock{<:DistributedAllocationCOO}, - axes::Tuple{<:Vector,<:Vector}) +function change_axes( + a::MatrixBlock{<:DistributedAllocationCOO}, + axes::Tuple{<:Vector,<:Vector} +) block_ids = CartesianIndices(a.array) rows, cols = axes array = map(block_ids) do I @@ -485,9 +490,9 @@ function local_views(a::DistributedAllocationCOO) a.allocs end -function local_views(a::DistributedAllocationCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange +function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) + @check test_gids === a.test_gids + @check trial_gids === a.trial_gids a.allocs end @@ -508,8 +513,8 @@ function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) return tuple_of_array_of_parrays end -get_test_gids(a::DistributedAllocationCOO) = a.test_dofs_gids_prange -get_trial_gids(a::DistributedAllocationCOO) = a.trial_dofs_gids_prange +get_test_gids(a::DistributedAllocationCOO) = a.test_gids +get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array)) @@ -536,31 +541,24 @@ function _fa_create_from_nz_with_callback(callback,a) # Recover some data I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) 
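+  # Note: at this point I,J,V hold per-part COO triplets indexed by the *local*
+  # numbering of the FE spaces; they are mapped to global ids below, used to
+  # build the final (rows,cols) partition, and then mapped back to its local ids.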
+ test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) - rows = _setup_prange(test_dofs_gids_prange,I;ghost=false,ax=:rows) + rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) b = callback(rows) # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) # Create the range for cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols) + cols = _setup_prange(trial_gids,J;ax=:cols) # Convert again I,J to local numeration to_local_indices!(I,rows;ax=:rows) to_local_indices!(J,cols;ax=:cols) - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress local portions - values = map(create_from_nz,local_views(asys)) - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) + A = _setup_matrix(I,J,V,rows,cols) return A, b end @@ -579,19 +577,19 @@ end function _sa_create_from_nz_with_callback(callback,async_callback,a,b) # Recover some data I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) # Create the Prange for the rows - rows = _setup_prange(test_dofs_gids_prange,I;ax=:rows) + rows = _setup_prange(test_gids,I;ax=:rows) # Move (I,J,V) triplets to the owner process of each row I. # The triplets are accompanyed which Jo which is the process column owner - Jo = get_gid_owners(J,trial_dofs_gids_prange;ax=:cols) + Jo = get_gid_owners(J,trial_gids;ax=:cols) t = _assemble_coo!(I,J,V,rows;owners=Jo) # Here we can overlap computations @@ -606,7 +604,7 @@ function _sa_create_from_nz_with_callback(callback,async_callback,a,b) wait(t) # Create the Prange for the cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols,owners=Jo) + cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) # Overlap rhs communications with CSC compression t2 = async_callback(b) @@ -615,19 +613,13 @@ function _sa_create_from_nz_with_callback(callback,async_callback,a,b) to_local_indices!(I,rows;ax=:rows) to_local_indices!(J,cols;ax=:cols) - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress the local matrices - values = map(create_from_nz,local_views(asys)) + A = _setup_matrix(I,J,V,rows,cols) # Wait the transfer to finish if !isa(t2,Nothing) wait(t2) end - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) + return A, b end @@ -657,7 +649,7 @@ end struct PVectorCounter{A,B,C} par_strategy::A counters::B - test_dofs_gids_prange::C + test_gids::C end Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() @@ -667,12 +659,12 @@ function local_views(a::PVectorCounter) end function local_views(a::PVectorCounter,rows) - @check rows === a.test_dofs_gids_prange + @check rows === a.test_gids a.counters end function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows}) - dofs = a.test_dofs_gids_prange + dofs = a.test_gids values = map(nz_allocation,a.counters) PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) end @@ -680,7 +672,7 @@ end struct PVectorAllocationTrackOnlyValues{A,B,C} par_strategy::A values::B - test_dofs_gids_prange::C + 
test_gids::C end function local_views(a::PVectorAllocationTrackOnlyValues) @@ -688,12 +680,12 @@ function local_views(a::PVectorAllocationTrackOnlyValues) end function local_views(a::PVectorAllocationTrackOnlyValues,rows) - @check rows === a.test_dofs_gids_prange + @check rows === a.test_gids a.values end function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - rows = _setup_prange_without_ghosts(a.test_dofs_gids_prange) + rows = _setup_prange_without_ghosts(a.test_gids) _rhs_callback(a,rows) end @@ -707,7 +699,7 @@ function _rhs_callback(row_partitioned_vector_partition,rows) # The ghost values in row_partitioned_vector_partition are # aligned with the FESpace but not with the ghost values in the rows of A b_fespace = PVector(row_partitioned_vector_partition.values, - partition(row_partitioned_vector_partition.test_dofs_gids_prange)) + partition(row_partitioned_vector_partition.test_gids)) # This one is aligned with the rows of A b = similar(b_fespace,eltype(b_fespace),(rows,)) @@ -758,7 +750,7 @@ end struct PVectorAllocationTrackTouchedAndValues{A,B,C} allocations::A values::B - test_dofs_gids_prange::C + test_gids::C end function Algebra.create_from_nz( @@ -787,7 +779,7 @@ Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Grida function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) - @check rows === a.test_dofs_gids_prange + @check rows === a.test_gids a.allocations end @@ -835,7 +827,7 @@ end function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, b::PVectorCounter{<:SubAssembledRows}) A = nz_allocation(a) - dofs = b.test_dofs_gids_prange + dofs = b.test_gids values = map(nz_allocation,b.counters) touched,allocations=_setup_touched_and_allocations_arrays(values) B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) @@ -843,7 +835,7 @@ function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, end function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows}) - dofs = a.test_dofs_gids_prange + dofs = a.test_gids values = map(nz_allocation,a.counters) touched,allocations=_setup_touched_and_allocations_arrays(values) return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) @@ -857,7 +849,7 @@ function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedA # Find the ghost rows allocations = local_views(a.allocations) - indices = partition(a.test_dofs_gids_prange) + indices = partition(a.test_gids) I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices dofs_lids_touched = findall(allocation.touched) loc_to_gho = local_to_ghost(indices) @@ -877,7 +869,7 @@ function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedA ghost_to_global, ghost_to_owner = map( find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays - ngids = length(a.test_dofs_gids_prange) + ngids = length(a.test_gids) _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) end @@ -996,15 +988,23 @@ end # _setup_matrix : local matrices + global PRanges -> Global matrix -function _setup_matrix(values,rows::PRange,cols::PRange) - return PSparseMatrix(values,partition(rows),partition(cols)) -end - -function _setup_matrix(values,rows::Vector{<:PRange},cols::Vector{<:PRange}) - block_ids = CartesianIndices((length(rows),length(cols))) - block_mats = map(block_ids) do I - block_values = map(v -> blocks(v)[I],values) - return _setup_matrix(block_values,rows[I[1]],cols[I[2]]) +function 
_setup_matrix(I,J,V,rows::PRange,cols::PRange) + assembled_rows = map(PArrays.remove_ghost,partition(rows)) + psparse( + I,J,V,assembled_rows,partition(cols); + split_format=true, + assembled=true, + assemble=true, + indices=:local, + reuse=false, + ) |> fetch +end + +function _setup_matrix(I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) + block_ids = CartesianIndices(I) + block_mats = map(block_ids) do id + i = id[1]; j = id[2]; + _setup_matrix(I[i,j],J[i,j],V[i,j],rows[i],cols[j]) end return mortar(block_mats) end @@ -1116,7 +1116,7 @@ function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) end end -# dofs_gids_prange can be either test_dofs_gids_prange or trial_dofs_gids_prange +# dofs_gids_prange can be either test_gids or trial_gids # In the former case, gids is a vector of global test dof identifiers, while in the # latter, a vector of global trial dof identifiers function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl index 41e3f6dc..942a7c28 100644 --- a/src/BlockPartitionedArrays.jl +++ b/src/BlockPartitionedArrays.jl @@ -271,8 +271,8 @@ function PartitionedArrays.partition(a::BlockPArray) return map(mortar,vals) end -function PartitionedArrays.to_trivial_partition(a::BlockPArray) - vals = map(PartitionedArrays.to_trivial_partition,blocks(a)) +function PartitionedArrays.centralize(a::BlockPArray) + vals = map(PartitionedArrays.centralize,blocks(a)) return mortar(vals) end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index c2b09475..844ec9e6 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -118,7 +118,7 @@ end function fetch_vector_ghost_values_cache(vector_partition,partition) cache = PArrays.p_vector_cache(vector_partition,partition) - map(reverse,cache) + reverse(cache) end function fetch_vector_ghost_values!(vector_partition,cache) @@ -163,7 +163,7 @@ function generate_gids( # swapped in the AssemblyCache of partition(cell_range) # Exchange the dof owners - cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) + cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait cell_wise_to_dof_wise!(ldof_to_owner, diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl index 3e810085..5ec6579c 100644 --- a/test/BlockPartitionedArraysTests.jl +++ b/test/BlockPartitionedArraysTests.jl @@ -76,7 +76,7 @@ t = assemble!(__v) assemble!(__m) |> wait fetch(t); -PartitionedArrays.to_trivial_partition(m) +PartitionedArrays.centralize(m) partition(v) partition(m) diff --git a/test/parrays_update.jl b/test/parrays_update.jl new file mode 100644 index 00000000..951a68d7 --- /dev/null +++ b/test/parrays_update.jl @@ -0,0 +1,36 @@ + +using Gridap +using PartitionedArrays +using GridapDistributed + +np = (1,2) +ranks = with_debug() do distribute + distribute(LinearIndices((prod(np),))) +end + +nc = (2,4) +serial_model = CartesianDiscreteModel((0,1,0,1),nc) +dist_model = CartesianDiscreteModel(ranks,np,(0,1,0,1),nc) + +cids = get_cell_gids(dist_model) + +reffe = ReferenceFE(lagrangian,Float64,1) +serial_V = TestFESpace(serial_model,reffe) +dist_V = TestFESpace(dist_model,reffe) + +serial_ids = get_free_dof_ids(serial_V) +dist_ids = get_free_dof_ids(dist_V) + +serial_Ω = Triangulation(serial_model) +serial_dΩ = Measure(serial_Ω,2) + +dist_Ω = Triangulation(dist_model) +dist_dΩ = Measure(dist_Ω,2) + +serial_a1(u,v) = 
∫(u⋅v)*serial_dΩ +serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V) + +dist_a1(u,v) = ∫(u⋅v)*dist_dΩ +dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) +all(centralize(dist_A1) - serial_A1 .< 1e-10) + From e17e85ce107030fcfdc3fafd8ae386a46176eb8f Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 2 Sep 2024 10:37:17 +1000 Subject: [PATCH 03/14] FESpace gids are now permuted LocalIndicesWithVariableBlockSize --- src/DistributedUtils.jl | 44 +++++++ src/FESpaces.jl | 240 ++++++++++++++++++--------------------- src/GridapDistributed.jl | 6 +- 3 files changed, 156 insertions(+), 134 deletions(-) create mode 100644 src/DistributedUtils.jl diff --git a/src/DistributedUtils.jl b/src/DistributedUtils.jl new file mode 100644 index 00000000..0b34a0a1 --- /dev/null +++ b/src/DistributedUtils.jl @@ -0,0 +1,44 @@ + +function permuted_variable_partition( + n_own::AbstractArray{<:Integer}, + gids::AbstractArray{<:AbstractArray{<:Integer}}, + owners::AbstractArray{<:AbstractArray{<:Integer}}; + n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), + start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +) + ranks = linear_indices(n_own) + np = length(ranks) + map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners + n_local = length(gids) + n_ghost = n_local - n_own + perm = fill(zero(Int32),n_local) + ghost_gids = fill(zero(Int),n_ghost) + ghost_owners = fill(zero(Int32),n_ghost) + + n_ghost = 0 + for (lid,(gid,owner)) in enumerate(zip(gids,owners)) + if owner == rank + perm[lid] = gid-start+1 + else + n_ghost += 1 + ghost_gids[n_ghost] = gid + ghost_owners[n_ghost] = owner + perm[lid] = n_own + n_ghost + end + end + @assert n_ghost == n_local - n_own + + ghost = GhostIndices(n_global,ghost_gids,ghost_owners) + dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( + CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost + ) + permute_indices(dof_ids,perm) + end +end + +function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T + ptrs = Vector{Int32}(undef,length(vv)+1) + Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv) + Arrays.length_to_ptrs!(ptrs) + return ptrs +end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 844ec9e6..02530cfa 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -56,36 +56,25 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va return free_values, dirichlet_values end -function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange) - map(cell_wise_vector, - dof_wise_vector, - cell_to_ldofs, - partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices +function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) + map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = cwv.ptrs - data = cwv.data - cell_own_to_local = own_to_local(indices) - for cell in cell_own_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) - p = ptrs[cell]-1 + p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) if ldof > 0 - data[i+p] = dwv[ldof] + cwv.data[i+p] = dwv[ldof] end end end end end -function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range) - map(dof_wise_vector, - cell_wise_vector, - cell_to_ldofs, - partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices +function 
cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) + map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) - for cell in cell_ghost_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) @@ -97,23 +86,14 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c end end -function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange) - cwv=map(cell_to_ldofs) do cell_to_ldofs - cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = Vector{Int32}(undef,ncells+1) - for cell in 1:ncells - ldofs = getindex!(cache,cell_to_ldofs,cell) - ptrs[cell+1] = length(ldofs) - end - PArrays.length_to_ptrs!(ptrs) - ndata = ptrs[end]-1 - data = Vector{Int}(undef,ndata) - data .= -1 - JaggedArray(data,ptrs) - end - dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange) - cwv +function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_ids) + cwv = map(cell_to_ldofs) do cell_to_ldofs + ptrs = generate_ptrs(cell_to_ldofs) + data = fill(-1,ptrs[end]-1) + JaggedArray(data,ptrs) + end + dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids) + return cwv end function fetch_vector_ghost_values_cache(vector_partition,partition) @@ -128,13 +108,15 @@ end function generate_gids( cell_range::PRange, cell_to_ldofs::AbstractArray{<:AbstractArray}, - nldofs::AbstractArray{<:Integer}) + nldofs::AbstractArray{<:Integer} +) + ranks = linear_indices(partition(cell_range)) # Find and count number owned dofs ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs ldof_to_owner = fill(Int32(0),nldofs) - cache = array_cache(cell_to_ldofs) lcell_to_owner = local_to_owner(indices) + cache = array_cache(cell_to_ldofs) for cell in 1:length(cell_to_ldofs) owner = lcell_to_owner[cell] ldofs = getindex!(cache,cell_to_ldofs,cell) @@ -151,102 +133,54 @@ function generate_gids( ldof_to_owner, nodofs end |> tuple_of_arrays - cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner, - cell_to_ldofs, - cell_range) - - # Note1 : this call potentially updates cell_prange with the - # info required to exchange info among nearest neighbours - # so that it can be re-used in the future for other exchanges - - # Note2 : we need to call reverse() as the senders and receivers are - # swapped in the AssemblyCache of partition(cell_range) - - # Exchange the dof owners - cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) - fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_owner, - cell_ldofs_to_part, - cell_to_ldofs, - cell_range) - # Find the global range of owned dofs first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs))) + + # Exchange the dof owners. Cell owner always has correct dof owner. 
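+  # Pack/exchange/unpack pattern: the dof-wise data is packed into cell-wise
+  # JaggedArrays on owned cells, consistent! copies owner values into ghost
+  # cells, and the received data is scattered back to the dofs of ghost cells.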
+ cell_ldofs_to_owner = dof_wise_to_cell_wise(ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_ldofs_to_owner,partition(cell_range))) |> wait + cell_wise_to_dof_wise!(ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)) - # Distribute gdofs to owned ones - ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices - me = part_id(indices) + # Fill owned gids + ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner + ldof_to_gdof = fill(0,length(ldof_to_owner)) offset = first_gdof-1 - ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner)) odof = 0 for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me + if owner == rank odof += 1 - ldof_to_gdof[ldof] = odof - else - ldof_to_gdof[ldof] = 0 - end - end - for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me - ldof_to_gdof[ldof] += offset + ldof_to_gdof[ldof] = odof + offset end end ldof_to_gdof end - # Create cell-wise global dofs - cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof, - cell_to_ldofs, - cell_range) + # Exchange gids + cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_to_gdofs,partition(cell_range))) |> wait - # Exchange the global dofs - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait - - # Distribute global dof ids also to ghost + # Fill ghost gids with exchanged information map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices - gdof = 0 cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) cell_local_to_owner = local_to_owner(indices) - for cell in cell_ghost_to_local + for cell in ghost_to_local(indices) ldofs = getindex!(cache,cell_to_ldofs,cell) + cell_owner = cell_local_to_owner[cell] p = cell_to_gdofs.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) - if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell] + dof_owner = ldof_to_owner[ldof] + if ldof > 0 && (dof_owner == cell_owner) ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p] end end end end - dof_wise_to_cell_wise!(cell_to_gdofs, - ldof_to_gdof, - cell_to_ldofs, - cell_range) - - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_gdof, - cell_to_gdofs, - cell_to_ldofs, - cell_range) - - # Setup DoFs LocalIndices - ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs))) - local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs, - indices, - ldof_to_gdof, - ldof_to_owner - me = part_id(indices) - LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner) - end - - # Setup dof range - dofs_range = PRange(local_indices) - - return dofs_range + # Setup DoFs AbstractLocalIndices + dof_ids = permuted_variable_partition( + nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof + ) + return PRange(dof_ids) end # FEFunction related @@ -576,12 +510,14 @@ end function FESpaces.collect_cell_matrix( trial::DistributedFESpace, test::DistributedFESpace, - a::DistributedDomainContribution) + a::DistributedDomainContribution +) map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) end function FESpaces.collect_cell_vector( - test::DistributedFESpace, a::DistributedDomainContribution) + test::DistributedFESpace, a::DistributedDomainContribution +) map(collect_cell_vector,local_views(test),local_views(a)) end @@ -589,12 +525,14 @@ function 
FESpaces.collect_cell_matrix_and_vector( trial::DistributedFESpace, test::DistributedFESpace, biform::DistributedDomainContribution, - liform::DistributedDomainContribution) + liform::DistributedDomainContribution +) map(collect_cell_matrix_and_vector, local_views(trial), local_views(test), local_views(biform), - local_views(liform)) + local_views(liform) + ) end function FESpaces.collect_cell_matrix_and_vector( @@ -602,17 +540,20 @@ function FESpaces.collect_cell_matrix_and_vector( test::DistributedFESpace, biform::DistributedDomainContribution, liform::DistributedDomainContribution, - uhd) + uhd +) map(collect_cell_matrix_and_vector, local_views(trial), local_views(test), local_views(biform), local_views(liform), - local_views(uhd)) + local_views(uhd) + ) end function FESpaces.collect_cell_vector( - test::DistributedFESpace,l::Number) + test::DistributedFESpace,l::Number +) map(local_views(test)) do s collect_cell_vector(s,l) end @@ -622,7 +563,8 @@ function FESpaces.collect_cell_matrix_and_vector( trial::DistributedFESpace, test::DistributedFESpace, mat::DistributedDomainContribution, - l::Number) + l::Number +) map(local_views(trial),local_views(test),local_views(mat)) do u,v,m collect_cell_matrix_and_vector(u,v,m,l) end @@ -633,26 +575,29 @@ function FESpaces.collect_cell_matrix_and_vector( test::DistributedFESpace, mat::DistributedDomainContribution, l::Number, - uhd) + uhd +) map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f collect_cell_matrix_and_vector(u,v,m,l,f) end end + """ """ -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler +struct DistributedSparseMatrixAssembler{A,B,C,D,E,F,G} <: SparseMatrixAssembler strategy::A assems::B matrix_builder::C vector_builder::D - test_dofs_gids_prange::E - trial_dofs_gids_prange::F + test_gids::E + trial_gids::F + caches::G end local_views(a::DistributedSparseMatrixAssembler) = a.assems -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_dofs_gids_prange -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_dofs_gids_prange +FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids +FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy @@ -694,20 +639,43 @@ function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrix map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) end +function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrixAssembler,matdata) + @check !isnothing(a.caches) "Assembler is not reusable!" + @check haskey(a.caches,objectid(mat)) "Cache not found!" 
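+  # The cache is keyed by objectid(mat), so this in-place reassembly must receive
+  # the exact matrix instance for which a cache was registered (presumably by the
+  # first assembly); the cache is finally consumed by psparse! in create_from_nz!.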
+ cache = a.caches[objectid(mat)] + + LinearAlgebra.fillstored!(mat,zero(eltype(mat))) + + counter = nz_counter(get_matrix_builder(a),(get_rows(a),get_cols(a))) + map(local_views(counter),partition(mat)) do counter, mat + counter.nnz = nnz(mat) + end + allocs = nz_allocation(counter) + numeric_loop_matrix!(allocs,a,matdata) + + create_from_nz!(mat,allocs,cache) +end + +function create_from_nz!(A,a::DistributedAllocationCOO{<:Assembled},cache) + _,_,V = get_allocations(a) + psparse!(A,V,cache) |> wait + return A +end + # Parallel Assembly strategies -function local_assembly_strategy(::SubAssembledRows,rows,cols) +function local_assembly_strategy(::Assembled,rows,cols) DefaultAssemblyStrategy() end # When using this one, make sure that you also loop over ghost cells. # This is at your own risk. -function local_assembly_strategy(::FullyAssembledRows,rows,cols) +function local_assembly_strategy(::LocallyAssembled,rows,cols) rows_local_to_ghost = local_to_ghost(rows) GenericAssemblyStrategy( identity, identity, - row->rows_local_to_ghost[row]==0, + row->iszero(rows_local_to_ghost[row]), col->true ) end @@ -718,7 +686,8 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, rows::PRange, cols::PRange, - par_strategy=SubAssembledRows() + par_strategy=Assembled(); + reuse_caches=false ) assems = map(partition(rows),partition(cols)) do rows,cols local_strategy = local_assembly_strategy(par_strategy,rows,cols) @@ -732,7 +701,12 @@ function FESpaces.SparseMatrixAssembler( end mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy) vec_builder = PVectorBuilder(local_vec_type,par_strategy) - return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols) + if reuse_caches + caches = Dict{UInt64,Any}() + else + caches = nothing + end + return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols,caches) end function FESpaces.SparseMatrixAssembler( @@ -740,22 +714,24 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, trial::DistributedFESpace, test::DistributedFESpace, - par_strategy=SubAssembledRows() + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... ) rows = get_free_dof_ids(test) cols = get_free_dof_ids(trial) - SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy) + SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) end function FESpaces.SparseMatrixAssembler( trial::DistributedFESpace, test::DistributedFESpace, - par_strategy=SubAssembledRows() + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... ) Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial))) T = eltype(Tv) Tm = SparseMatrixCSC{T,Int} - SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy) + SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
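+  # Usage sketch (hypothetical; assumes the first assembly registers its cache):
+  #   assem = SparseMatrixAssembler(trial,test,Assembled();reuse_caches=true)
+  #   A = assemble_matrix(assem,matdata)
+  #   assemble_matrix!(A,assem,matdata) # reuses the cached psparse! structure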
end # ZeroMean FESpace diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl index 8c20a975..7edf2a48 100644 --- a/src/GridapDistributed.jl +++ b/src/GridapDistributed.jl @@ -31,8 +31,8 @@ import Base: inv, abs, abs2, *, +, -, /, adjoint, transpose, real, imag, conj, g import Gridap.Fields: grad2curl import Gridap.CellData: Interpolable -export FullyAssembledRows -export SubAssembledRows +export LocallyAssembled +export Assembled export get_cell_gids export get_face_gids @@ -42,6 +42,8 @@ export with_ghost, no_ghost export redistribute +include("DistributedUtils.jl") + include("BlockPartitionedArrays.jl") include("Algebra.jl") From aab58d445844909236f3b3f1852261f334295fbf Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Sat, 7 Sep 2024 00:18:42 +1000 Subject: [PATCH 04/14] Started re-structuring assembly --- src/Algebra.jl | 1143 -------------------- src/Assembly.jl | 199 ++++ src/DistributedUtils.jl | 44 - src/Geometry.jl | 4 +- src/GridapDistributed.jl | 5 +- src/GridapExtras.jl | 55 + src/MultiField.jl | 2 +- src/PArraysExtras.jl | 210 ++++ src/_Algebra_old.jl | 1238 ++++++++++++++++++++++ test/BlockSparseMatrixAssemblersTests.jl | 4 +- test/DivConformingTests.jl | 2 +- test/FESpacesTests.jl | 4 +- test/HeatEquationTests.jl | 2 +- test/PLaplacianTests.jl | 4 +- test/parrays_update.jl | 51 + 15 files changed, 1768 insertions(+), 1199 deletions(-) create mode 100644 src/Assembly.jl delete mode 100644 src/DistributedUtils.jl create mode 100644 src/GridapExtras.jl create mode 100644 src/PArraysExtras.jl create mode 100644 src/_Algebra_old.jl diff --git a/src/Algebra.jl b/src/Algebra.jl index 11af9149..3fa0b502 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -29,192 +29,6 @@ function Algebra.allocate_in_domain(matrix::BlockPMatrix) allocate_in_domain(BlockPVector{V},matrix) end -# PSparseMatrix copy - -function Base.copy(a::PSparseMatrix) - mats = map(copy,partition(a)) - cache = map(PartitionedArrays.copy_cache,a.cache) - return PSparseMatrix(mats,partition(axes(a,1)),partition(axes(a,2)),cache) -end - -# PartitionedArrays extras - -function LinearAlgebra.axpy!(α,x::PVector,y::PVector) - @check partition(axes(x,1)) === partition(axes(y,1)) - map(partition(x),partition(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - consistent!(y) |> wait - return y -end - -function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) - map(blocks(x),blocks(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - return y -end - -function Algebra.axpy_entries!( - α::Number, A::PSparseMatrix, B::PSparseMatrix; - check::Bool=true -) -# We should definitely check here that the index partitions are the same. -# However: Because the different matrices are assembled separately, the objects are not the -# same (i.e can't use ===). Checking the index partitions would then be costly... - @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) - @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) - map(partition(A),partition(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -function Algebra.axpy_entries!( - α::Number, A::BlockPMatrix, B::BlockPMatrix; - check::Bool=true -) - map(blocks(A),blocks(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.ArrayCounter,axes) - @notimplemented -end - -# This might go to Gridap in the future. We keep it here for the moment. 
-function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} - b = Algebra.CounterCOO{T}(axes) - b.nnz = a.nnz - b -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} - counter = change_axes(a.counter,axes) - Algebra.AllocationCOO(counter,a.I,a.J,a.V) -end - -# Array of PArrays -> PArray of Arrays -function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - PartitionedArrays.getany(aj) - end - end -end - -function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - aj.items[i] - end - end -end - -# This type is required because MPIArray from PArrays -# cannot be instantiated with a NULL communicator -struct MPIVoidVector{T} <: AbstractVector{T} - comm::MPI.Comm - function MPIVoidVector(::Type{T}) where {T} - new{T}(MPI.COMM_NULL) - end -end - -Base.size(a::MPIVoidVector) = (0,) -Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() -function Base.getindex(a::MPIVoidVector,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.setindex!(a::MPIVoidVector,v,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) - println(io,"MPIVoidVector") -end - -function num_parts(comm::MPI.Comm) - if comm != MPI.COMM_NULL - nparts = MPI.Comm_size(comm) - else - nparts = -1 - end - nparts -end -@inline num_parts(comm::MPIArray) = num_parts(comm.comm) -@inline num_parts(comm::DebugArray) = length(comm.items) -@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) - -function get_part_id(comm::MPI.Comm) - if comm != MPI.COMM_NULL - id = MPI.Comm_rank(comm)+1 - else - id = -1 - end - id -end -@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) -@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) - -""" - i_am_in(comm::MPIArray) - i_am_in(comm::DebugArray) - - Returns `true` if the processor is part of the subcommunicator `comm`. 
-""" -function i_am_in(comm::MPI.Comm) - get_part_id(comm) >=0 -end -@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) -@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) -@inline i_am_in(comm::DebugArray) = true - -function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) - x_new = map(new_parts) do p - if isa(x,MPIArray) - PartitionedArrays.getany(x) - elseif isa(x,DebugArray) && (p <= length(x.items)) - x.items[p] - else - default - end - end - return x_new -end - -function generate_subparts(parts::MPIArray,new_comm_size) - root_comm = parts.comm - root_size = MPI.Comm_size(root_comm) - rank = MPI.Comm_rank(root_comm) - - @static if isdefined(MPI,:MPI_UNDEFINED) - mpi_undefined = MPI.MPI_UNDEFINED[] - else - mpi_undefined = MPI.API.MPI_UNDEFINED[] - end - - if root_size == new_comm_size - return parts - else - if rank < new_comm_size - comm = MPI.Comm_split(root_comm,0,0) - return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) - else - comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) - return MPIVoidVector(eltype(parts)) - end - end -end - -function generate_subparts(parts::DebugArray,new_comm_size) - DebugArray(LinearIndices((new_comm_size,))) -end - # local_views function local_views(a) @@ -284,960 +98,3 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_ end return BlockPVector(vals,ids) end - -# This function computes a mapping among the local identifiers of a and b -# for which the corresponding global identifiers are both in a and b. -# Note that the haskey check is necessary because in the general case -# there might be gids in b which are not present in a -function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) - a_local_to_b_local = fill(Int32(-1),local_length(a)) - b_local_to_global = local_to_global(b) - a_global_to_local = global_to_local(a) - for blid in 1:local_length(b) - gid = b_local_to_global[blid] - if a_global_to_local[gid] != zero(eltype(a_global_to_local)) - alid = a_global_to_local[gid] - a_local_to_b_local[alid] = blid - end - end - a_local_to_b_local -end - -# This type is required in order to be able to access the local portion -# of distributed sparse matrices and vectors using local indices from the -# distributed test and trial spaces -struct LocalView{T,N,A,B} <:AbstractArray{T,N} - plids_to_value::A - d_to_lid_to_plid::B - local_size::NTuple{N,Int} - function LocalView( - plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} - A = typeof(plids_to_value) - B = typeof(d_to_lid_to_plid) - local_size = map(length,d_to_lid_to_plid) - new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) - end -end - -Base.size(a::LocalView) = a.local_size -Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() -function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) - if all(i->i>0,plids) - a.plids_to_value[plids...] - else - zero(T) - end -end -function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) - @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" - a.plids_to_value[plids...] 
= v -end - -function _lid_to_plid(lid,lid_to_plid) - plid = lid_to_plid[lid] - plid -end - -function local_views(a::PVector,new_rows::PRange) - old_rows = axes(a,1) - if partition(old_rows) === partition(new_rows) - partition(a) - else - map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows - LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) - end - end -end - -function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) - old_rows, old_cols = axes(a) - if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) - partition(a) - else - map(partition(a), - partition(old_rows),partition(old_cols), - partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols - rl2lmap = find_local_to_local_map(new_rows,old_rows) - cl2lmap = find_local_to_local_map(new_cols,old_cols) - LocalView(matrix_partition,(rl2lmap,cl2lmap)) - end - end -end - -function local_views(a::BlockPVector,new_rows::BlockPRange) - vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays - return map(mortar,vals) -end - -function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) - vals = map(CartesianIndices(blocksize(a))) do I - local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) - end |> to_parray_of_arrays - return map(mortar,vals) -end - -# PSparseMatrix assembly - -struct FullyAssembledRows end -struct SubAssembledRows end - -# For the moment we use COO format even though -# it is quite memory consuming. -# We need to implement communication in other formats in -# PartitionedArrays.jl - -struct PSparseMatrixBuilderCOO{T,B} - local_matrix_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter( - builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange} -) where A - rows, cols = axs # test ids, trial ids - counters = map(partition(rows),partition(cols)) do r,c - axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) - Algebra.CounterCOO{A}(axs) - end - DistributedCounterCOO(builder.par_strategy,counters,rows,cols) -end - -function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv - T = eltype(Tv) - return PSparseMatrix{T} -end - - -""" -""" -struct DistributedCounterCOO{A,B,C,D} <: GridapType - par_strategy::A - counters::B - test_gids::C - trial_gids::D - function DistributedCounterCOO( - par_strategy, - counters::AbstractArray{<:Algebra.CounterCOO}, - test_gids::PRange, - trial_gids::PRange) - A = typeof(par_strategy) - B = typeof(counters) - C = typeof(test_gids) - D = typeof(trial_gids) - new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) - end -end - -function local_views(a::DistributedCounterCOO) - return a.counters -end - -function local_views(a::DistributedCounterCOO,test_gids,trial_gids) - @check PArrays.matching_local_indices(test_gids,a.test_gids) - @check PArrays.matching_local_indices(trial_gids,a.trial_gids) - return a.counters -end - -function Algebra.nz_allocation(a::DistributedCounterCOO) - allocs = map(nz_allocation,a.counters) - DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) -end - -struct DistributedAllocationCOO{A,B,C,D} <: GridapType - par_strategy::A - allocs::B - test_gids::C - trial_gids::D - function DistributedAllocationCOO( - par_strategy, - allocs::AbstractArray{<:Algebra.AllocationCOO}, - test_gids::PRange, - trial_gids::PRange) - A = typeof(par_strategy) - B = typeof(allocs) - C = typeof(test_gids) - D = 
typeof(trial_gids) - new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) - end -end - -function change_axes( - a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, - axes::Tuple{<:PRange,<:PRange} -) where {A,B} - local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols - (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) - end - allocs = map(change_axes,a.allocs,local_axes) - DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) -end - -function change_axes( - a::MatrixBlock{<:DistributedAllocationCOO}, - axes::Tuple{<:Vector,<:Vector} -) - block_ids = CartesianIndices(a.array) - rows, cols = axes - array = map(block_ids) do I - change_axes(a[I],(rows[I[1]],cols[I[2]])) - end - return ArrayBlock(array,a.touched) -end - -function local_views(a::DistributedAllocationCOO) - a.allocs -end - -function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) - @check test_gids === a.test_gids - @check trial_gids === a.trial_gids - a.allocs -end - -function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) - array = map(local_views,a.array) |> to_parray_of_arrays - return map(ai -> ArrayBlock(ai,a.touched),array) -end - -function get_allocations(a::DistributedAllocationCOO) - I,J,V = map(local_views(a)) do alloc - alloc.I, alloc.J, alloc.V - end |> tuple_of_arrays - return I,J,V -end - -function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) - tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays - return tuple_of_array_of_parrays -end - -get_test_gids(a::DistributedAllocationCOO) = a.test_gids -get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids -get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) -get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array)) - -function Algebra.create_from_nz(a::PSparseMatrix) - # For FullyAssembledRows the underlying Exchanger should - # not have ghost layer making assemble! 
do nothing (TODO check) - assemble!(a) |> wait - a -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:FullyAssembledRows}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:FullyAssembledRows}}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function _fa_create_from_nz_with_callback(callback,a) - - # Recover some data - I,J,V = get_allocations(a) - test_gids = get_test_gids(a) - trial_gids = get_trial_gids(a) - - rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) - b = callback(rows) - - # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) - - # Create the range for cols - cols = _setup_prange(trial_gids,J;ax=:cols) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - A = _setup_matrix(I,J,V,rows,cols) - return A, b -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:SubAssembledRows}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:SubAssembledRows}}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function _sa_create_from_nz_with_callback(callback,async_callback,a,b) - # Recover some data - I,J,V = get_allocations(a) - test_gids = get_test_gids(a) - trial_gids = get_trial_gids(a) - - # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) - - # Create the Prange for the rows - rows = _setup_prange(test_gids,I;ax=:rows) - - # Move (I,J,V) triplets to the owner process of each row I. 
- # The triplets are accompanyed which Jo which is the process column owner - Jo = get_gid_owners(J,trial_gids;ax=:cols) - t = _assemble_coo!(I,J,V,rows;owners=Jo) - - # Here we can overlap computations - # This is a good place to overlap since - # sending the matrix rows is a lot of data - if !isa(b,Nothing) - bprange=_setup_prange_from_pvector_allocation(b) - b = callback(bprange) - end - - # Wait the transfer to finish - wait(t) - - # Create the Prange for the cols - cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) - - # Overlap rhs communications with CSC compression - t2 = async_callback(b) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - A = _setup_matrix(I,J,V,rows,cols) - - # Wait the transfer to finish - if !isa(t2,Nothing) - wait(t2) - end - - return A, b -end - - -# PVector assembly - -struct PVectorBuilder{T,B} - local_vector_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange}) - T = builder.local_vector_type - rows, = axs - counters = map(partition(rows)) do rows - axs = (Base.OneTo(local_length(rows)),) - nz_counter(ArrayBuilder(T),axs) - end - PVectorCounter(builder.par_strategy,counters,rows) -end - -function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv - T = eltype(Tv) - return PVector{T} -end - -struct PVectorCounter{A,B,C} - par_strategy::A - counters::B - test_gids::C -end - -Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() - -function local_views(a::PVectorCounter) - a.counters -end - -function local_views(a::PVectorCounter,rows) - @check rows === a.test_gids - a.counters -end - -function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows}) - dofs = a.test_gids - values = map(nz_allocation,a.counters) - PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) -end - -struct PVectorAllocationTrackOnlyValues{A,B,C} - par_strategy::A - values::B - test_gids::C -end - -function local_views(a::PVectorAllocationTrackOnlyValues) - a.values -end - -function local_views(a::PVectorAllocationTrackOnlyValues,rows) - @check rows === a.test_gids - a.values -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - rows = _setup_prange_without_ghosts(a.test_gids) - _rhs_callback(a,rows) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:SubAssembledRows}) - # This point MUST NEVER be reached. 
If reached there is an inconsistency - # in the parallel code in charge of vector assembly - @assert false -end - -function _rhs_callback(row_partitioned_vector_partition,rows) - # The ghost values in row_partitioned_vector_partition are - # aligned with the FESpace but not with the ghost values in the rows of A - b_fespace = PVector(row_partitioned_vector_partition.values, - partition(row_partitioned_vector_partition.test_gids)) - - # This one is aligned with the rows of A - b = similar(b_fespace,eltype(b_fespace),(rows,)) - - # First transfer owned values - b .= b_fespace - - # Now transfer ghost - function transfer_ghost(b,b_fespace,ids,ids_fespace) - num_ghosts_vec = ghost_length(ids) - gho_to_loc_vec = ghost_to_local(ids) - loc_to_glo_vec = local_to_global(ids) - gid_to_lid_fe = global_to_local(ids_fespace) - for ghost_lid_vec in 1:num_ghosts_vec - lid_vec = gho_to_loc_vec[ghost_lid_vec] - gid = loc_to_glo_vec[lid_vec] - lid_fespace = gid_to_lid_fe[gid] - b[lid_vec] = b_fespace[lid_fespace] - end - end - map( - transfer_ghost, - partition(b), - partition(b_fespace), - b.index_partition, - b_fespace.index_partition) - - return b -end - -function Algebra.create_from_nz(a::PVector) - assemble!(a) |> wait - return a -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:FullyAssembledRows}, - c_fespace::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - - function callback(rows) - _rhs_callback(c_fespace,rows) - end - - A,b = _fa_create_from_nz_with_callback(callback,a) - return A,b -end - -struct PVectorAllocationTrackTouchedAndValues{A,B,C} - allocations::A - values::B - test_gids::C -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:SubAssembledRows}, - b::PVectorAllocationTrackTouchedAndValues) - - function callback(rows) - _rhs_callback(b,rows) - end - - function async_callback(b) - # now we can assemble contributions - assemble!(b) - end - - A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) - return A,b -end - -struct ArrayAllocationTrackTouchedAndValues{A} - touched::Vector{Bool} - values::A -end - -Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() - - -function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) - @check rows === a.test_gids - a.allocations -end - -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if i>0 - if !(a.touched[i]) - a.touched[i]=true - end - if !isa(v,Nothing) - a.values[i]=c(v,a.values[i]) - end - end - nothing -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if !isa(v,Nothing) - for (ve,ie) in zip(v,i) - Arrays.add_entry!(c,a,ve,ie) - end - else - for ie in i - Arrays.add_entry!(c,a,nothing,ie) - end - end - nothing -end - - -function _setup_touched_and_allocations_arrays(values) - touched = map(values) do values - fill!(Vector{Bool}(undef,length(values)),false) - end - allocations = map(values,touched) do values,touched - ArrayAllocationTrackTouchedAndValues(touched,values) - end - touched, allocations -end - -function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, - b::PVectorCounter{<:SubAssembledRows}) - A = nz_allocation(a) - dofs = b.test_gids - values = 
map(nz_allocation,b.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) - return A,B -end - -function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows}) - dofs = a.test_gids - values = map(nz_allocation,a.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) -end - -function local_views(a::PVectorAllocationTrackTouchedAndValues) - a.allocations -end - -function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) - - # Find the ghost rows - allocations = local_views(a.allocations) - indices = partition(a.test_gids) - I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices - dofs_lids_touched = findall(allocation.touched) - loc_to_gho = local_to_ghost(indices) - n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) - I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) - cur = 1 - for lid in dofs_lids_touched - dof_lid = loc_to_gho[lid] - if dof_lid != 0 - I_ghost_lids[cur] = dof_lid - cur = cur+1 - end - end - I_ghost_lids - end - - ghost_to_global, ghost_to_owner = map( - find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays - - ngids = length(a.test_gids) - _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) - rows = _setup_prange_from_pvector_allocation(a) - b = _rhs_callback(a,rows) - t2 = assemble!(b) - # Wait the transfer to finish - if t2 !== nothing - wait(t2) - end - return b -end - -# Common Assembly Utilities -function first_gdof_from_ids(ids) - if own_length(ids) == 0 - return 1 - end - lid_to_gid = local_to_global(ids) - owned_to_lid = own_to_local(ids) - return Int(lid_to_gid[first(owned_to_lid)]) -end - -function find_gid_and_owner(ighost_to_jghost,jindices) - jghost_to_local = ghost_to_local(jindices) - jlocal_to_global = local_to_global(jindices) - jlocal_to_owner = local_to_owner(jindices) - ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) - - ighost_to_global = jlocal_to_global[ighost_to_jlocal] - ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] - return ighost_to_global, ighost_to_owner -end - -# The given ids are assumed to be a sub-set of the lids -function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) - glo_to_loc = global_to_local(a) - loc_to_gho = local_to_ghost(a) - - # First pass: Allocate - i = 0 - ghost_lids_touched = fill(false,ghost_length(a)) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - i += 1 - end - end - gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) - - # Second pass: fill - i = 1 - fill!(ghost_lids_touched,false) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - gids_ghost_lid_to_ghost_lid[i] = ghost_lid - i += 1 - end - end - - return gids_ghost_lid_to_ghost_lid -end - -# Find the neighbours of partition1 trying -# to use those in partition2 as a hint -function _find_neighbours!(partition1, partition2) - partition2_snd, partition2_rcv = assembly_neighbors(partition2) - partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv) - return assembly_neighbors(partition1; 
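# reusing partition2's ExchangeGraph as a hint avoids a second neighbor-discovery pass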
neighbors=partition2_graph) -end - -# to_global! & to_local! analogs, needed for dispatching in block assembly - -function to_local_indices!(I,ids::PRange;kwargs...) - map(to_local!,I,partition(ids)) -end - -function to_global_indices!(I,ids::PRange;kwargs...) - map(to_global!,I,partition(ids)) -end - -function get_gid_owners(I,ids::PRange;kwargs...) - map(I,partition(ids)) do I, indices - glo_to_loc = global_to_local(indices) - loc_to_own = local_to_owner(indices) - map(x->loc_to_own[glo_to_loc[x]], I) - end -end - -for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] - @eval begin - function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) - map($f,I,ids) - end - - function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) - @check ax ∈ [:rows,:cols] - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if ax == :rows - $f(I[i,j],ids[i]) - else - $f(I[i,j],ids[j]) - end - end - end - end -end - -# _setup_matrix : local matrices + global PRanges -> Global matrix - -function _setup_matrix(I,J,V,rows::PRange,cols::PRange) - assembled_rows = map(PArrays.remove_ghost,partition(rows)) - psparse( - I,J,V,assembled_rows,partition(cols); - split_format=true, - assembled=true, - assemble=true, - indices=:local, - reuse=false, - ) |> fetch -end - -function _setup_matrix(I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) - block_ids = CartesianIndices(I) - block_mats = map(block_ids) do id - i = id[1]; j = id[2]; - _setup_matrix(I[i,j],J[i,j],V[i,j],rows[i],cols[j]) - end - return mortar(block_mats) -end - -# _assemble_coo! : local coo triplets + global PRange -> Global coo values - -function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) - if isa(owners,Nothing) - PArrays.assemble_coo!(I,J,V,partition(rows)) - else - assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) - end -end - -function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if isa(owners,Nothing) - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) - else - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) - end - end -end - -function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) - """ - Returns three JaggedArrays with the coo triplets - to be sent to the corresponding owner parts in parts_snd - """ - function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) - global_to_local_row = global_to_local(row_lids) - local_row_to_owner = local_to_owner(row_lids) - owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) - ptrs = zeros(Int32,length(parts_snd)+1) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - ptrs[owner_to_i[owner]+1] +=1 - end - end - PArrays.length_to_ptrs!(ptrs) - gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) - gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) - jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) - v_snd_data = zeros(eltype(k_v),ptrs[end]-1) - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - gj = k_gj[k] - v = k_v[k] - p = ptrs[owner_to_i[owner]] - gi_snd_data[p] = gi - gj_snd_data[p] = gj - jo_snd_data[p] = k_jo[k] - v_snd_data[p] = v - k_v[k] = zero(v) - ptrs[owner_to_i[owner]] += 1 - end - end - PArrays.rewind_ptrs!(ptrs) - gi_snd = JaggedArray(gi_snd_data,ptrs) - gj_snd = JaggedArray(gj_snd_data,ptrs) 
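# setup_snd above uses the classic CSR-style pointer idiom: count entries per
# destination, turn counts into start offsets, fill flat buffers while
# advancing the offsets, then rewind (PArrays.length_to_ptrs! and
# PArrays.rewind_ptrs! do this bookkeeping in the real code). The same idiom
# with plain vectors:
function bucket_by_owner(owner::Vector{Int},data::Vector{Float64},nowners::Int)
  ptrs = zeros(Int32,nowners+1)
  for o in owner
    ptrs[o+1] += 1           # count entries per owner
  end
  ptrs[1] = 1
  for i in 1:nowners
    ptrs[i+1] += ptrs[i]     # counts -> start offsets (length_to_ptrs!)
  end
  flat = zeros(ptrs[end]-1)
  for (o,v) in zip(owner,data)
    flat[ptrs[o]] = v        # place entry, then advance this owner's cursor
    ptrs[o] += 1
  end
  for i in nowners:-1:1
    ptrs[i+1] = ptrs[i]      # rewind cursors back to start offsets (rewind_ptrs!)
  end
  ptrs[1] = 1
  return flat,ptrs
end
flat,ptrs = bucket_by_owner([2,1,2,1],[10.0,20.0,30.0,40.0],2)
@assert flat == [20.0,40.0,10.0,30.0] && ptrs == Int32[1,3,5]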
- jo_snd = JaggedArray(jo_snd_data,ptrs) - v_snd = JaggedArray(v_snd_data,ptrs) - gi_snd, gj_snd, jo_snd, v_snd - end - """ - Pushes to coo_entries_with_column_owner the tuples - gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes - """ - function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - current_n = length(k_gi) - new_n = current_n + length(gi_rcv.data) - resize!(k_gi,new_n) - resize!(k_gj,new_n) - resize!(k_jo,new_n) - resize!(k_v,new_n) - for p in 1:length(gi_rcv.data) - k_gi[current_n+p] = gi_rcv.data[p] - k_gj[current_n+p] = gj_rcv.data[p] - k_jo[current_n+p] = jo_rcv.data[p] - k_v[current_n+p] = v_rcv.data[p] - end - end - part = linear_indices(row_partition) - parts_snd, parts_rcv = assembly_neighbors(row_partition) - coo_entries_with_column_owner = map(tuple,I,J,Jown,V) - gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays - graph = ExchangeGraph(parts_snd,parts_rcv) - t1 = exchange(gi_snd,graph) - t2 = exchange(gj_snd,graph) - t3 = exchange(jo_snd,graph) - t4 = exchange(v_snd,graph) - @async begin - gi_rcv = fetch(t1) - gj_rcv = fetch(t2) - jo_rcv = fetch(t3) - v_rcv = fetch(t4) - map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - I,J,Jown,V - end -end - -# dofs_gids_prange can be either test_gids or trial_gids -# In the former case, gids is a vector of global test dof identifiers, while in the -# latter, a vector of global trial dof identifiers -function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) - if !ghost - _setup_prange_without_ghosts(dofs_gids_prange) - elseif isa(owners,Nothing) - _setup_prange_with_ghosts(dofs_gids_prange,gids) - else - _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) - end -end - -function _setup_prange( - dofs_gids_prange::AbstractVector{<:PRange}, - gids::AbstractMatrix; - ax=:rows,ghost=true,owners=nothing -) - @check ax ∈ (:rows,:cols) - block_ids = LinearIndices(dofs_gids_prange) - pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) - - gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange - gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] - gids_union = pvcat(gids_slice) - - owners_union = nothing - if !isnothing(owners) - owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] - owners_union = pvcat(owners_slice) - end - - return gids_union, owners_union - end |> tuple_of_arrays - - return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) -end - -# Create PRange for the rows of the linear system -# without local ghost dofs as per required in the -# FullyAssembledRows() parallel assembly strategy -function _setup_prange_without_ghosts(dofs_gids_prange::PRange) - ngdofs = length(dofs_gids_prange) - indices = map(partition(dofs_gids_prange)) do dofs_indices - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) - OwnAndGhostIndices(own_indices,ghost_indices) - end - return PRange(indices) -end - -# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition -# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
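# The @async block above posts all four exchanges before fetching any of them,
# so the communications overlap instead of completing one at a time. The same
# task-based pattern in plain Julia, with local work standing in for the
# exchanges:
function overlapped_sums(xs::Vector{Vector{Float64}})
  tasks = map(x -> @async(sum(x)), xs)  # launch everything first
  return map(fetch,tasks)               # only then wait on the results
end
@assert overlapped_sums([[1.0,2.0],[3.0]]) == [3.0,3.0]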
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed -# is that each processor can determine locally with a single layer of ghost cells the global indices and associated -# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities -# (i.e., either cells or faces). -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> dof PRange ghost ids - gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition) |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner) -end - -# Here we cannot assume that the sparse communication graph underlying -# trial_dofs_gids_partition is a superset of the one underlying indices. -# Here we chould check whether it is included and call _find_neighbours!() -# if this is the case. At present, we are not taking advantage of this, -# but let the parallel scalable algorithm to compute the reciprocal to do the job. -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - gids,owners,dofs_gids_partition) do gids, owners, indices - ghost_touched = Dict{Int,Bool}() - ghost_to_global = Int64[] - ghost_to_owner = Int64[] - me = part_id(indices) - for (j,jo) in zip(gids,owners) - if jo != me - if !haskey(ghost_touched,j) - push!(ghost_to_global,j) - push!(ghost_to_owner,jo) - ghost_touched[j] = true - end - end - end - ghost_to_global, ghost_to_owner - end |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=false) -end - -function _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=true) - indices = map(dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner) - OwnAndGhostIndices(own_indices,ghost_indices) - end - if discover_neighbours - _find_neighbours!(indices,dofs_gids_partition) - end - return PRange(indices) -end diff --git a/src/Assembly.jl b/src/Assembly.jl new file mode 100644 index 00000000..25e2411f --- /dev/null +++ b/src/Assembly.jl @@ -0,0 +1,199 @@ + +# Parallel assembly strategies + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +# PSparseMatrix and PVector builders + +struct PSparseMatrixBuilder{T,B} + local_matrix_type::Type{T} + strategy::B +end + +function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv + T = eltype(Tv) + return PSparseMatrix{T} +end + +struct PVectorBuilder{T,B} + local_vector_type::Type{T} + strategy::B +end + +function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv + T = eltype(Tv) + 
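# How the renamed strategies are used downstream (see the test updates later in
# this patch): LocallyAssembled() replaces FullyAssembledRows() and Assembled()
# replaces SubAssembledRows(). A representative call, assuming trial/test
# spaces U and V have already been built:
#
#   assem = SparseMatrixAssembler(U, V, LocallyAssembled()) # rows carry no ghosts
#   assem = SparseMatrixAssembler(U, V, Assembled())        # ghost rows exchanged during assembly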
return PVector{T} +end + +# Distributed counters and allocators + +""" + DistributedCounter{S,T,N} <: GridapType + +Distributed N-dimensional counter, with local counters of type T. +Follows assembly strategy S. +""" +struct DistributedCounter{S,T,N,A,B} <: GridapType + counters :: A + axes :: B + strategy :: S + function DistributedCounter( + counters :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: Algebra.AssemblyStrategy + ) where {T,N} + A, B, S = typeof(counters), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(counters,axes,strategy) + end +end + +Base.axes(a::DistributedCounter) = a.axes +local_views(a::DistributedCounter) = a.counters + +const PVectorCounter{S,T} = DistributedCounter{S,T<:Algebra.ArrayCounter,1} +Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() + +const PSparseMatrixCounter{S,T} = DistributedCounter{S,T<:Algebra.CounterCOO,2} +Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop() + +""" + DistributedAllocation{S,T,N} <: GridapType + +Distributed N-dimensional allocator, with local allocators of type T. +Follows assembly strategy S. +""" +struct DistributedAllocation{S,T,N,A,B} <: GridapType + allocs :: A + axes :: B + strategy :: S + function DistributedAllocation( + allocs :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: Algebra.AssemblyStrategy + ) where {T,N} + A, B, S = typeof(allocs), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(allocs,axes,strategy) + end +end + +Base.axes(a::DistributedAllocation) = a.axes +local_views(a::DistributedAllocation) = a.allocs + +const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} +const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} + +# PSparseMatrix assembly chain +# +# 1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter +# 2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation +# 3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix + +function Algebra.nz_counter(builder::PSparseMatrixBuilder,axs::NTuple{2,<:PRange}) + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do rows,cols + axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) + Algebra.CounterCOO{A}(axs) + end + DistributedCounter(builder.par_strategy,counters,rows,cols) +end + +function Algebra.nz_allocation(a::PSparseMatrixCounter) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +# PVector assembly chain: +# +# 1 - nz_counter(PVectorBuilder) -> PVectorCounter +# 2 - nz_allocation(PVectorCounter) -> PVectorAllocation +# 3 - create_from_nz(PVectorAllocation) -> PVector + +function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) where VT + rows, = axs + counters = map(partition(rows)) do rows + axs = (Base.OneTo(local_length(rows)),) + nz_counter(ArrayBuilder(VT),axs) + end + DistributedCounter(counters,axs,builder.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled}) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + values = map(nz_allocation,local_views(a)) + allocs = map(TrackedArrayAllocation,values) + return DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector}) + # This point MUST NEVER be reached. 
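# The three-step chain listed above, spelled out as a sketch. The local matrix
# type and the (rows, cols) PRanges are assumed to come from the FE spaces;
# inside Gridap this sequence is driven by the SparseMatrixAssembler machinery
# rather than called by hand:
#
#   builder = PSparseMatrixBuilder(SparseMatrixCSC{Float64,Int}, Assembled())
#   counter = Algebra.nz_counter(builder, (rows, cols)) # 1: symbolic loop counts entries
#   alloc   = Algebra.nz_allocation(counter)            # 2: numeric loop fills COO triplets
#   A       = Algebra.create_from_nz(alloc)             # 3: communicate and compress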
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}) + rows = _setup_prange_without_ghosts(axes(a,1)) + _rhs_callback(a,rows) +end + +function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S + rows = _setup_prange_from_pvector_allocation(a) + b = _rhs_callback(a,rows) + t2 = assemble!(b) + if t2 !== nothing + wait(t2) + end + return b +end + +# PSystem assembly chain: +# +# When assembling a full system (matrix + vector), it is more efficient to +# overlap communications the assembly of the matrix and the vector. +# Not only it is faster, but also necessary to ensure identical ghost indices +# in both the matrix and vector rows. +# This is done by using the following specializations chain: + +function Arrays.nz_allocation( + a::PSparseMatrixCounter{<:Assembled}, + b::PVectorCounter{<:Assembled} +) + A = nz_allocation(a) # PSparseMatrixAllocation + B = nz_allocation(b) # PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:LocallyAssembled}, + b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector} +) + function callback(rows) + _rhs_callback(b,rows) + end + A, B = _fa_create_from_nz_with_callback(callback,a) + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:Assembled}, + b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} +) + function callback(rows) + _rhs_callback(b,rows) + end + function async_callback(b) + assemble!(b) + end + A, B = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + return A, B +end diff --git a/src/DistributedUtils.jl b/src/DistributedUtils.jl deleted file mode 100644 index 0b34a0a1..00000000 --- a/src/DistributedUtils.jl +++ /dev/null @@ -1,44 +0,0 @@ - -function permuted_variable_partition( - n_own::AbstractArray{<:Integer}, - gids::AbstractArray{<:AbstractArray{<:Integer}}, - owners::AbstractArray{<:AbstractArray{<:Integer}}; - n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), - start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) -) - ranks = linear_indices(n_own) - np = length(ranks) - map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners - n_local = length(gids) - n_ghost = n_local - n_own - perm = fill(zero(Int32),n_local) - ghost_gids = fill(zero(Int),n_ghost) - ghost_owners = fill(zero(Int32),n_ghost) - - n_ghost = 0 - for (lid,(gid,owner)) in enumerate(zip(gids,owners)) - if owner == rank - perm[lid] = gid-start+1 - else - n_ghost += 1 - ghost_gids[n_ghost] = gid - ghost_owners[n_ghost] = owner - perm[lid] = n_own + n_ghost - end - end - @assert n_ghost == n_local - n_own - - ghost = GhostIndices(n_global,ghost_gids,ghost_owners) - dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( - CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost - ) - permute_indices(dof_ids,perm) - end -end - -function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T - ptrs = Vector{Int32}(undef,length(vv)+1) - Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv) - Arrays.length_to_ptrs!(ptrs) - return ptrs -end diff --git a/src/Geometry.jl b/src/Geometry.jl index 4c788242..0f814b3c 100644 --- a/src/Geometry.jl +++ b/src/Geometry.jl @@ -582,7 +582,7 @@ function filter_cells_when_needed( end function filter_cells_when_needed( - portion::FullyAssembledRows, + 
portion::LocallyAssembled, cell_gids::AbstractLocalIndices, trian::Triangulation) @@ -590,7 +590,7 @@ function filter_cells_when_needed( end function filter_cells_when_needed( - portion::SubAssembledRows, + portion::Assembled, cell_gids::AbstractLocalIndices, trian::Triangulation) diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl index 7edf2a48..b55ddfe8 100644 --- a/src/GridapDistributed.jl +++ b/src/GridapDistributed.jl @@ -42,12 +42,15 @@ export with_ghost, no_ghost export redistribute -include("DistributedUtils.jl") +include("GridapExtras.jl") +include("PArraysExtras.jl") include("BlockPartitionedArrays.jl") include("Algebra.jl") +include("Assembly.jl") + include("Geometry.jl") include("CellData.jl") diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl new file mode 100644 index 00000000..df5b1186 --- /dev/null +++ b/src/GridapExtras.jl @@ -0,0 +1,55 @@ + +# Extensions to Gridap/Arrays/Tables.jl + +function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T + ptrs = Vector{Int32}(undef,length(vv)+1) + Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv) + Arrays.length_to_ptrs!(ptrs) + return ptrs +end + +# New type of Vector allocation + +""" + TrackedArrayAllocation{T} <: GridapType + +Array that keeps track of which entries have been touched. For assembly purposes. +""" +struct TrackedArrayAllocation{T} + values :: Vector{T} + touched :: Vector{Bool} +end + +function TrackedArrayAllocation(values) + touched = fill(false,length(values)) + TrackedArrayAllocation(values,touched) +end + +Algebra.LoopStyle(::Type{<:TrackedArrayAllocation}) = Algebra.Loop() +Algebra.create_from_nz(a::TrackedArrayAllocation) = a.values + +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + if i > 0 + a.touched[i] = true + end + nothing +end +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v,i) + if i > 0 + a.touched[i] = true + a.values[i] = combine(v,a.values[i]) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + for ie in i + Arrays.add_entry!(combine,a,nothing,ie) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v,i) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(combine,a,ve,ie) + end + nothing +end diff --git a/src/MultiField.jl b/src/MultiField.jl index 689a0621..75e55d83 100644 --- a/src/MultiField.jl +++ b/src/MultiField.jl @@ -453,7 +453,7 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, - par_strategy=SubAssembledRows()) where {NB,SB,P} + par_strategy=Assembled()) where {NB,SB,P} block_idx = CartesianIndices((NB,NB)) block_rows = blocks(test.gids) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl new file mode 100644 index 00000000..f4a5ae2e --- /dev/null +++ b/src/PArraysExtras.jl @@ -0,0 +1,210 @@ + +""" + permuted_variable_partition(n_own,gids,owners; n_global, start) + +Create indices which are a permuted version of a variable_partition. +The advantage of this wrt the `LocalIndices` type, is that we can compute +dof owners with minimal communications, since we only need the size of the blocks. + +NOTE: This is the type for our FESpace dof_ids. 
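# A quick usage sketch for TrackedArrayAllocation (defined in GridapExtras.jl
# above): touches are recorded even when no value is contributed, which is what
# the ghost-row detection during vector assembly relies on. Runnable from
# within the package namespace:
#
#   alloc = TrackedArrayAllocation(zeros(3))
#   Arrays.add_entry!(+, alloc, 2.0, 1)         # touch and accumulate
#   Arrays.add_entries!(+, alloc, nothing, [2]) # touch only (symbolic pass)
#   @assert alloc.touched == [true, true, false]
#   @assert Algebra.create_from_nz(alloc) == [2.0, 0.0, 0.0]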
+""" +function permuted_variable_partition( + n_own::AbstractArray{<:Integer}, + gids::AbstractArray{<:AbstractArray{<:Integer}}, + owners::AbstractArray{<:AbstractArray{<:Integer}}; + n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), + start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +) + ranks = linear_indices(n_own) + np = length(ranks) + map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners + n_local = length(gids) + n_ghost = n_local - n_own + perm = fill(zero(Int32),n_local) + ghost_gids = fill(zero(Int),n_ghost) + ghost_owners = fill(zero(Int32),n_ghost) + + n_ghost = 0 + for (lid,(gid,owner)) in enumerate(zip(gids,owners)) + if owner == rank + perm[lid] = gid-start+1 + else + n_ghost += 1 + ghost_gids[n_ghost] = gid + ghost_owners[n_ghost] = owner + perm[lid] = n_own + n_ghost + end + end + @assert n_ghost == n_local - n_own + + ghost = GhostIndices(n_global,ghost_gids,ghost_owners) + dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( + CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost + ) + permute_indices(dof_ids,perm) + end +end + +# Linear algebra + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check partition(axes(x,1)) === partition(axes(y,1)) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... 
+ @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# Array of PArrays -> PArray of Arrays +# TODO: I think this is now implemented in PartitionedArrays.jl (check) +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +# Communication extras, subpartitioning extras + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end + diff --git a/src/_Algebra_old.jl b/src/_Algebra_old.jl new file mode 100644 index 00000000..e5aefd6a --- /dev/null +++ b/src/_Algebra_old.jl @@ -0,0 +1,1238 @@ + +# Vector allocation + +function Algebra.allocate_vector(::Type{<:PVector{V}},ids::PRange) where {V} + PVector{V}(undef,partition(ids)) +end + +function Algebra.allocate_vector(::Type{<:BlockPVector{V}},ids::BlockPRange) where {V} + BlockPVector{V}(undef,ids) +end + +function Algebra.allocate_in_range(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(PVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(PVector{V},matrix) +end + +function Algebra.allocate_in_range(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(BlockPVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(BlockPVector{V},matrix) +end + +# PartitionedArrays extras + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check partition(axes(x,1)) === partition(axes(y,1)) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# TODO: Can we remove this? 
+# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.ArrayCounter,axes) + @notimplemented +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end + +# Array of PArrays -> PArray of Arrays +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end + +# local_views + +function local_views(a) + @abstractmethod +end + +function get_parts(a) + return linear_indices(local_views(a)) +end + +function local_views(a::AbstractVector,rows) + @notimplemented +end + +function local_views(a::AbstractMatrix,rows,cols) + @notimplemented +end + +local_views(a::AbstractArray) = a +local_views(a::PRange) = partition(a) +local_views(a::PVector) = partition(a) +local_views(a::PSparseMatrix) = partition(a) + +function local_views(a::BlockPRange) + map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays +end + +function local_views(a::BlockPArray) + vals = map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# change_ghost + +function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T + same_partition = (a.index_partition === partition(ids)) + a_new = same_partition ? a : change_ghost(T,a,ids) + if make_consistent && (!same_partition || !is_consistent) + consistent!(a_new) |> wait + end + return a_new +end + +function change_ghost(::Type{<:AbstractVector},a::PVector,ids::PRange) + a_new = similar(a,eltype(a),(ids,)) + # Equivalent to copy!(a_new,a) but does not check that owned indices match + map(copy!,own_values(a_new),own_values(a)) + return a_new +end + +function change_ghost(::Type{<:OwnAndGhostVectors},a::PVector,ids::PRange) + values = map(own_values(a),partition(ids)) do own_vals,ids + ghost_vals = fill(zero(eltype(a)),ghost_length(ids)) + perm = PartitionedArrays.local_permutation(ids) + OwnAndGhostVectors(own_vals,ghost_vals,perm) + end + return PVector(values,partition(ids)) +end + +function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_consistent=false) + vals = map(blocks(a),blocks(ids)) do a, ids + change_ghost(a,ids;is_consistent=is_consistent,make_consistent=make_consistent) + end + return BlockPVector(vals,ids) +end + +# TODO: Can we remove this? +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case +# there might be gids in b which are not present in a +function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) + a_local_to_b_local = fill(Int32(-1),local_length(a)) + b_local_to_global = local_to_global(b) + a_global_to_local = global_to_local(a) + for blid in 1:local_length(b) + gid = b_local_to_global[blid] + if a_global_to_local[gid] != zero(eltype(a_global_to_local)) + alid = a_global_to_local[gid] + a_local_to_b_local[alid] = blid + end + end + a_local_to_b_local +end + +# TODO: Can we remove this? +# This type is required in order to be able to access the local portion +# of distributed sparse matrices and vectors using local indices from the +# distributed test and trial spaces +struct LocalView{T,N,A,B} <:AbstractArray{T,N} + plids_to_value::A + d_to_lid_to_plid::B + local_size::NTuple{N,Int} + function LocalView( + plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} + A = typeof(plids_to_value) + B = typeof(d_to_lid_to_plid) + local_size = map(length,d_to_lid_to_plid) + new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) + end +end + +Base.size(a::LocalView) = a.local_size +Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() +function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + if all(i->i>0,plids) + a.plids_to_value[plids...] + else + zero(T) + end +end +function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" + a.plids_to_value[plids...] = v +end + +function _lid_to_plid(lid,lid_to_plid) + plid = lid_to_plid[lid] + plid +end + +function local_views(a::PVector,new_rows::PRange) + old_rows = axes(a,1) + if partition(old_rows) === partition(new_rows) + partition(a) + else + map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows + LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) + end + end +end + +function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) + old_rows, old_cols = axes(a) + if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) + partition(a) + else + map(partition(a), + partition(old_rows),partition(old_cols), + partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols + rl2lmap = find_local_to_local_map(new_rows,old_rows) + cl2lmap = find_local_to_local_map(new_cols,old_cols) + LocalView(matrix_partition,(rl2lmap,cl2lmap)) + end + end +end + +function local_views(a::BlockPVector,new_rows::BlockPRange) + vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays + return map(mortar,vals) +end + +function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) + vals = map(CartesianIndices(blocksize(a))) do I + local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# PSparseMatrix assembly + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +struct PSparseMatrixBuilderCOO{T,B} + local_matrix_type::Type{T} + par_strategy::B +end + +function Algebra.nz_counter( + builder::PSparseMatrixBuilderCOO{A}, 
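# What find_local_to_local_map computes, traced on plain vectors: for each
# local id of `a`, the local id of `b` holding the same global id, or -1 when
# that gid is absent from `b`. Toy data over 6 global ids:
#
#   b_local_to_global  = [4, 1, 6]
#   a_global_to_local  = [2, 0, 1, 3, 0, 0]  # a holds gids 1, 3, 4
#   # scanning b: gid 4 -> a lid 3, gid 1 -> a lid 2; gid 6 is not in a
#   a_local_to_b_local = [-1, 2, 1]          # a lid 1 (gid 3) has no match in b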
axs::Tuple{<:PRange,<:PRange} +) where A + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do r,c + axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) + Algebra.CounterCOO{A}(axs) + end + DistributedCounterCOO(builder.par_strategy,counters,rows,cols) +end + +function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv + T = eltype(Tv) + return PSparseMatrix{T} +end + + +""" +""" +struct DistributedCounterCOO{A,B,C,D} <: GridapType + par_strategy::A + counters ::B + test_gids ::C + trial_gids ::D + function DistributedCounterCOO( + par_strategy, + counters::AbstractArray{<:Algebra.CounterCOO}, + test_gids::PRange, + trial_gids::PRange + ) + A = typeof(par_strategy) + B = typeof(counters) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) + end +end + +function local_views(a::DistributedCounterCOO) + return a.counters +end + +function local_views(a::DistributedCounterCOO,test_gids,trial_gids) + @check PArrays.matching_local_indices(test_gids,a.test_gids) + @check PArrays.matching_local_indices(trial_gids,a.trial_gids) + return a.counters +end + +function Algebra.nz_allocation(a::DistributedCounterCOO) + allocs = map(nz_allocation,a.counters) # Array{AllocationCOO} + DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) +end + +struct DistributedAllocationCOO{A,B,C,D} <: GridapType + par_strategy::A + allocs::B + test_gids::C + trial_gids::D + function DistributedAllocationCOO( + par_strategy, + allocs::AbstractArray{<:Algebra.AllocationCOO}, + test_gids::PRange, + trial_gids::PRange) + A = typeof(par_strategy) + B = typeof(allocs) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) + end +end + +function change_axes( + a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, + axes::Tuple{<:PRange,<:PRange} +) where {A,B} + local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols + (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) +end + +function change_axes( + a::MatrixBlock{<:DistributedAllocationCOO}, + axes::Tuple{<:Vector,<:Vector} +) + block_ids = CartesianIndices(a.array) + rows, cols = axes + array = map(block_ids) do I + change_axes(a[I],(rows[I[1]],cols[I[2]])) + end + return ArrayBlock(array,a.touched) +end + +function local_views(a::DistributedAllocationCOO) + a.allocs +end + +function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) + @check test_gids === a.test_gids + @check trial_gids === a.trial_gids + a.allocs +end + +function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) + array = map(local_views,a.array) |> to_parray_of_arrays + return map(ai -> ArrayBlock(ai,a.touched),array) +end + +function get_allocations(a::DistributedAllocationCOO) + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + +function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) + tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays + return tuple_of_array_of_parrays +end + +get_test_gids(a::DistributedAllocationCOO) = a.test_gids +get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids +get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) +get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = 
map(get_trial_gids,diag(a.array)) + +function Algebra.create_from_nz(a::PSparseMatrix) + # For LocallyAssembled the underlying Exchanger should + # not have ghost layer making assemble! do nothing (TODO check) + assemble!(a) |> wait + a +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:LocallyAssembled}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:LocallyAssembled}}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function _fa_create_from_nz_with_callback(callback,a) + + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) + b = callback(rows) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the range for cols + cols = _setup_prange(trial_gids,J;ax=:cols) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(I,J,V,rows,cols) + return A, b +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:Assembled}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:Assembled}}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function _sa_create_from_nz_with_callback(callback,async_callback,a,b) + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the Prange for the rows + rows = _setup_prange(test_gids,I;ax=:rows) + + # Move (I,J,V) triplets to the owner process of each row I. 
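# The index round trip driven by the two create_from_nz helpers above, on plain
# vectors for a single part: COO row ids leave the cell loop in FE-space local
# numbering, are lifted to global ids, and are finally renumbered into the
# local ids of the assembled row partition.
#
#   fe_local_to_global   = [10, 11, 12]         # FE-space numbering
#   rows_global_to_local = Dict(10=>1, 12=>2)   # final row partition
#   I = [1, 3, 1]                               # as produced by the cell loop
#   I = fe_local_to_global[I]                   # to_global_indices! -> [10, 12, 10]
#   I = [rows_global_to_local[g] for g in I]    # to_local_indices!  -> [1, 2, 1]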
+ # The triplets are accompanyed which Jo which is the process column owner + Jo = get_gid_owners(J,trial_gids;ax=:cols) + t = _assemble_coo!(I,J,V,rows;owners=Jo) + + # Here we can overlap computations + # This is a good place to overlap since + # sending the matrix rows is a lot of data + if !isa(b,Nothing) + bprange=_setup_prange_from_pvector_allocation(b) + b = callback(bprange) + end + + # Wait the transfer to finish + wait(t) + + # Create the Prange for the cols + cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) + + # Overlap rhs communications with CSC compression + t2 = async_callback(b) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(a,I,J,V,rows,cols) + + # Wait the transfer to finish + if !isa(t2,Nothing) + wait(t2) + end + + return A, b +end + + +# PVector assembly + +struct PVectorBuilder{T,B} + local_vector_type::Type{T} + par_strategy::B +end + +function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange}) + T = builder.local_vector_type + rows, = axs + counters = map(partition(rows)) do rows + axs = (Base.OneTo(local_length(rows)),) + nz_counter(ArrayBuilder(T),axs) + end + PVectorCounter(builder.par_strategy,counters,rows) +end + +function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv + T = eltype(Tv) + return PVector{T} +end + +struct PVectorCounter{A,B,C} + par_strategy::A + counters::B + test_gids::C +end + +Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() + +function local_views(a::PVectorCounter) + a.counters +end + +function local_views(a::PVectorCounter,rows) + @check rows === a.test_gids + a.counters +end + +function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled}) + dofs = a.test_gids + values = map(nz_allocation,a.counters) + PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) +end + +struct PVectorAllocationTrackOnlyValues{A,B,C} + par_strategy::A + values::B + test_gids::C +end + +function local_views(a::PVectorAllocationTrackOnlyValues) + a.values +end + +function local_views(a::PVectorAllocationTrackOnlyValues,rows) + @check rows === a.test_gids + a.values +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:LocallyAssembled}) + rows = _setup_prange_without_ghosts(a.test_gids) + _rhs_callback(a,rows) +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:Assembled}) + # This point MUST NEVER be reached. 
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function _rhs_callback(row_partitioned_vector_partition,rows) + # The ghost values in row_partitioned_vector_partition are + # aligned with the FESpace but not with the ghost values in the rows of A + b_fespace = PVector(row_partitioned_vector_partition.values, + partition(row_partitioned_vector_partition.test_gids)) + + # This one is aligned with the rows of A + b = similar(b_fespace,eltype(b_fespace),(rows,)) + + # First transfer owned values + b .= b_fespace + + # Now transfer ghost + function transfer_ghost(b,b_fespace,ids,ids_fespace) + num_ghosts_vec = ghost_length(ids) + gho_to_loc_vec = ghost_to_local(ids) + loc_to_glo_vec = local_to_global(ids) + gid_to_lid_fe = global_to_local(ids_fespace) + for ghost_lid_vec in 1:num_ghosts_vec + lid_vec = gho_to_loc_vec[ghost_lid_vec] + gid = loc_to_glo_vec[lid_vec] + lid_fespace = gid_to_lid_fe[gid] + b[lid_vec] = b_fespace[lid_fespace] + end + end + map( + transfer_ghost, + partition(b), + partition(b_fespace), + b.index_partition, + b_fespace.index_partition + ) + + return b +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:LocallyAssembled}, + c_fespace::PVectorAllocationTrackOnlyValues{<:LocallyAssembled} +) + + function callback(rows) + _rhs_callback(c_fespace,rows) + end + + A,b = _fa_create_from_nz_with_callback(callback,a) + return A,b +end + +struct PVectorAllocationTrackTouchedAndValues{A,B,C} + allocations::A + values::B + test_gids::C +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:Assembled}, + b::PVectorAllocationTrackTouchedAndValues +) + + function callback(rows) + _rhs_callback(b,rows) + end + + function async_callback(b) + # now we can assemble contributions + assemble!(b) + end + + A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + return A,b +end + +struct ArrayAllocationTrackTouchedAndValues{A} + touched::Vector{Bool} + values::A +end + +Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() + +function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) + @check PArrays.matching_local_indices(rows, a.test_gids) + a.allocations +end + +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if i > 0 + a.touched[i]=true + if !isnothing(v) + a.values[i]=c(v,a.values[i]) + end + end + nothing +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if !isa(v,Nothing) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(c,a,ve,ie) + end + else + for ie in i + Arrays.add_entry!(c,a,nothing,ie) + end + end + nothing +end + + +function _setup_touched_and_allocations_arrays(values) + touched = map(values) do values + fill!(Vector{Bool}(undef,length(values)),false) + end + allocations = map(values,touched) do values,touched + ArrayAllocationTrackTouchedAndValues(touched,values) + end + touched, allocations +end + +function Arrays.nz_allocation( + a::DistributedCounterCOO{<:Assembled}, + b::PVectorCounter{<:Assembled} +) + A = nz_allocation(a) + dofs = b.test_gids + values = map(nz_allocation,b.counters) + 
touched, allocations = _setup_touched_and_allocations_arrays(values) + B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) + return A, B +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + dofs = a.test_gids + values = map(nz_allocation,a.counters) + touched,allocations=_setup_touched_and_allocations_arrays(values) + return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) +end + +function local_views(a::PVectorAllocationTrackTouchedAndValues) + a.allocations +end + +function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) + + # Find the ghost rows + allocations = local_views(a.allocations) + indices = partition(a.test_gids) + I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices + dofs_lids_touched = findall(allocation.touched) + loc_to_gho = local_to_ghost(indices) + n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) + I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) + cur = 1 + for lid in dofs_lids_touched + dof_lid = loc_to_gho[lid] + if dof_lid != 0 + I_ghost_lids[cur] = dof_lid + cur = cur+1 + end + end + I_ghost_lids + end + + ghost_to_global, ghost_to_owner = map( + find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays + + ngids = length(a.test_gids) + _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) + rows = _setup_prange_from_pvector_allocation(a) + b = _rhs_callback(a,rows) + t2 = assemble!(b) + # Wait the transfer to finish + if t2 !== nothing + wait(t2) + end + return b +end + +# Common Assembly Utilities +function first_gdof_from_ids(ids) + if own_length(ids) == 0 + return 1 + end + lid_to_gid = local_to_global(ids) + owned_to_lid = own_to_local(ids) + return Int(lid_to_gid[first(owned_to_lid)]) +end + +function find_gid_and_owner(ighost_to_jghost,jindices) + jghost_to_local = ghost_to_local(jindices) + jlocal_to_global = local_to_global(jindices) + jlocal_to_owner = local_to_owner(jindices) + ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) + + ighost_to_global = jlocal_to_global[ighost_to_jlocal] + ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] + return ighost_to_global, ighost_to_owner +end + +# The given ids are assumed to be a sub-set of the lids +function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) + glo_to_loc = global_to_local(a) + loc_to_gho = local_to_ghost(a) + + # First pass: Allocate + i = 0 + ghost_lids_touched = fill(false,ghost_length(a)) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + i += 1 + end + end + gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) + + # Second pass: fill + i = 1 + fill!(ghost_lids_touched,false) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + gids_ghost_lid_to_ghost_lid[i] = ghost_lid + i += 1 + end + end + + return gids_ghost_lid_to_ghost_lid +end + +# Find the neighbours of partition1 trying +# to use those in partition2 as a hint +function _find_neighbours!(partition1, partition2) + partition2_snd, partition2_rcv = assembly_neighbors(partition2) + partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv) + return assembly_neighbors(partition1; neighbors=partition2_graph) +end + +# to_global! 
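# find_gid_and_owner above, traced with plain vectors: ghost positions of index
# set i are translated into local positions of index set j, then read off as
# global ids and owners.
#
#   jghost_to_local  = [3, 5]        # j's ghosts sit at local positions 3 and 5
#   jlocal_to_global = [1, 2, 7, 4, 9]
#   jlocal_to_owner  = [1, 1, 2, 1, 3]
#   ighost_to_jghost = [2, 1]
#   ighost_to_jlocal = sort(jghost_to_local[[2, 1]]) # -> [3, 5]
#   # resulting globals: [7, 9]; owners: [2, 3]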
& to_local! analogs, needed for dispatching in block assembly + +function to_local_indices!(I,ids::PRange;kwargs...) + map(to_local!,I,partition(ids)) +end + +function to_global_indices!(I,ids::PRange;kwargs...) + map(to_global!,I,partition(ids)) +end + +function get_gid_owners(I,ids::PRange;kwargs...) + map(I,partition(ids)) do I, indices + glo_to_loc = global_to_local(indices) + loc_to_own = local_to_owner(indices) + map(x->loc_to_own[glo_to_loc[x]], I) + end +end + +for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] + @eval begin + function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) + map($f,I,ids) + end + + function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) + @check ax ∈ [:rows,:cols] + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if ax == :rows + $f(I[i,j],ids[i]) + else + $f(I[i,j],ids[j]) + end + end + end + end +end + + +function _setup_matrix( + a,I,J,V,rows::PRange,cols::PRange +) + _rows = PRange(map(remove_ghost,partition(rows))) + asys = change_axes(a,(_rows,cols)) + values = map(create_from_nz,local_views(asys)) + PSparseMatrix(values,partition(_rows),partition(cols),true) +end + +function _setup_matrix(a,I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) + block_ids = CartesianIndices(I) + block_mats = map(block_ids) do id + i = id[1]; j = id[2]; + _setup_matrix(a[i,j],I[i,j],J[i,j],V[i,j],rows[i],cols[j]) + end + return mortar(block_mats) +end + +# _assemble_coo! : local coo triplets + global PRange -> Global coo values + +function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) + if isa(owners,Nothing) + PArrays.assemble_coo!(I,J,V,partition(rows)) + else + assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) + end +end + +function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if isa(owners,Nothing) + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) + else + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) + end + end +end + +function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) + """ + Returns three JaggedArrays with the coo triplets + to be sent to the corresponding owner parts in parts_snd + """ + function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) + global_to_local_row = global_to_local(row_lids) + local_row_to_owner = local_to_owner(row_lids) + owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) + ptrs = zeros(Int32,length(parts_snd)+1) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + ptrs[owner_to_i[owner]+1] +=1 + end + end + PArrays.length_to_ptrs!(ptrs) + gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) + gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) + jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) + v_snd_data = zeros(eltype(k_v),ptrs[end]-1) + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + gj = k_gj[k] + v = k_v[k] + p = ptrs[owner_to_i[owner]] + gi_snd_data[p] = gi + gj_snd_data[p] = gj + jo_snd_data[p] = k_jo[k] + v_snd_data[p] = v + k_v[k] = zero(v) + ptrs[owner_to_i[owner]] += 1 + end + end + PArrays.rewind_ptrs!(ptrs) + gi_snd = JaggedArray(gi_snd_data,ptrs) + gj_snd = JaggedArray(gj_snd_data,ptrs) + jo_snd = JaggedArray(jo_snd_data,ptrs) + v_snd = JaggedArray(v_snd_data,ptrs) + gi_snd, gj_snd, jo_snd, v_snd + 
end + """ + Pushes to coo_entries_with_column_owner the tuples + gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes + """ + function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + current_n = length(k_gi) + new_n = current_n + length(gi_rcv.data) + resize!(k_gi,new_n) + resize!(k_gj,new_n) + resize!(k_jo,new_n) + resize!(k_v,new_n) + for p in 1:length(gi_rcv.data) + k_gi[current_n+p] = gi_rcv.data[p] + k_gj[current_n+p] = gj_rcv.data[p] + k_jo[current_n+p] = jo_rcv.data[p] + k_v[current_n+p] = v_rcv.data[p] + end + end + part = linear_indices(row_partition) + parts_snd, parts_rcv = assembly_neighbors(row_partition) + coo_entries_with_column_owner = map(tuple,I,J,Jown,V) + gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays + graph = ExchangeGraph(parts_snd,parts_rcv) + t1 = exchange(gi_snd,graph) + t2 = exchange(gj_snd,graph) + t3 = exchange(jo_snd,graph) + t4 = exchange(v_snd,graph) + @async begin + gi_rcv = fetch(t1) + gj_rcv = fetch(t2) + jo_rcv = fetch(t3) + v_rcv = fetch(t4) + map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + I,J,Jown,V + end +end + +# dofs_gids_prange can be either test_gids or trial_gids +# In the former case, gids is a vector of global test dof identifiers, while in the +# latter, a vector of global trial dof identifiers +function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) + if !ghost + _setup_prange_without_ghosts(dofs_gids_prange) + elseif isa(owners,Nothing) + _setup_prange_with_ghosts(dofs_gids_prange,gids) + else + _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) + end +end + +function _setup_prange( + dofs_gids_prange::AbstractVector{<:PRange}, + gids::AbstractMatrix; + ax=:rows,ghost=true,owners=nothing +) + @check ax ∈ (:rows,:cols) + block_ids = LinearIndices(dofs_gids_prange) + pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) + + gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange + gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] + gids_union = pvcat(gids_slice) + + owners_union = nothing + if !isnothing(owners) + owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] + owners_union = pvcat(owners_slice) + end + + return gids_union, owners_union + end |> tuple_of_arrays + + return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) +end + +# Create PRange for the rows of the linear system +# without local ghost dofs as per required in the +# LocallyAssembled() parallel assembly strategy +function _setup_prange_without_ghosts(dofs_gids_prange::PRange) + ngdofs = length(dofs_gids_prange) + indices = map(partition(dofs_gids_prange)) do dofs_indices + owner = part_id(dofs_indices) + own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) + ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) + OwnAndGhostIndices(own_indices,ghost_indices) + end + return PRange(indices) +end + +# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition +# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
+# The precondition required for the consistency of any parallel assembly process in GridapDistributed
+# is that each processor can determine locally, with a single layer of ghost cells, the global indices
+# and associated processor owners of the rows that it touches after assembly of integration terms posed
+# on locally-owned entities (i.e., either cells or faces).
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> dof PRange ghost ids
+  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition
+  ) |> tuple_of_arrays
+
+  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
+end
+
+# Here we cannot assume that the sparse communication graph underlying
+# trial_dofs_gids_partition is a superset of the one underlying indices.
+# We should check whether it is included and call _find_neighbours!()
+# if this is the case. At present, we do not take advantage of this, and
+# instead let the scalable parallel algorithm that computes the reciprocal do the job.
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    gids,owners,dofs_gids_partition) do gids, owners, indices
+    ghost_touched = Dict{Int,Bool}()
+    ghost_to_global = Int64[]
+    ghost_to_owner = Int64[]
+    me = part_id(indices)
+    for (j,jo) in zip(gids,owners)
+      if jo != me
+        if !haskey(ghost_touched,j)
+          push!(ghost_to_global,j)
+          push!(ghost_to_owner,jo)
+          ghost_touched[j] = true
+        end
+      end
+    end
+    ghost_to_global, ghost_to_owner
+  end |> tuple_of_arrays
+
+  return _setup_prange_impl_(
+    ngdofs,
+    dofs_gids_partition,
+    gids_ghost_to_global,
+    gids_ghost_to_owner;
+    discover_neighbours=false
+  )
+end
+
+function _setup_prange_impl_(
+  ngdofs,
+  dofs_gids_partition,
+  gids_ghost_to_global,
+  gids_ghost_to_owner;
+  discover_neighbours=true
+)
+  indices = map(dofs_gids_partition,
+                gids_ghost_to_global,
+                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner
+    owner = part_id(dofs_indices)
+    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
+    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
+    OwnAndGhostIndices(own_indices,ghost_indices)
+  end
+  if discover_neighbours
+    _find_neighbours!(indices,dofs_gids_partition)
+  end
+  return PRange(indices)
+end
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc97..645d7dc2 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))
 
-  assem = SparseMatrixAssembler(X,Y,FullyAssembledRows())
+  assem = SparseMatrixAssembler(X,Y,LocallyAssembled())
   A1 = assemble_matrix(assem,matdata)
   b1 = assemble_vector(assem,vecdata)
   A2,b2 = assemble_matrix_and_vector(assem,data);
@@ -68,7 +68,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   bmatdata = collect_cell_matrix(Xb,Yb,biform(ub,vb))
   bvecdata = 
collect_cell_vector(Yb,liform(vb)) - assem_blocks = SparseMatrixAssembler(Xb,Yb,FullyAssembledRows()) + assem_blocks = SparseMatrixAssembler(Xb,Yb,LocallyAssembled()) A1_blocks = assemble_matrix(assem_blocks,bmatdata); b1_blocks = assemble_vector(assem_blocks,bvecdata); @test is_same_vector(b1_blocks,b1,Yb,Y) diff --git a/test/DivConformingTests.jl b/test/DivConformingTests.jl index 2a1886b3..38044564 100644 --- a/test/DivConformingTests.jl +++ b/test/DivConformingTests.jl @@ -63,7 +63,7 @@ function f(model,reffe) V = FESpace(model,reffe,conformity=:Hdiv) U = TrialFESpace(V) - das = FullyAssembledRows() + das = LocallyAssembled() trian = Triangulation(das,model) degree = 2 dΩ = Measure(trian,degree) diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl index bf1e78e6..22e35453 100644 --- a/test/FESpacesTests.jl +++ b/test/FESpacesTests.jl @@ -71,8 +71,8 @@ function assemble_tests(das,dΩ,dΩass,U,V) end function main(distribute,parts) - main(distribute,parts,SubAssembledRows()) - main(distribute,parts,FullyAssembledRows()) + main(distribute,parts,Assembled()) + main(distribute,parts,LocallyAssembled()) end function main(distribute,parts,das) diff --git a/test/HeatEquationTests.jl b/test/HeatEquationTests.jl index fa96dc11..89077ecb 100644 --- a/test/HeatEquationTests.jl +++ b/test/HeatEquationTests.jl @@ -37,7 +37,7 @@ function main(distribute,parts) op = TransientFEOperator(res,jac,jac_t,U,V0) - assembler = SparseMatrixAssembler(U,V0,SubAssembledRows()) + assembler = SparseMatrixAssembler(U,V0,Assembled()) op_constant = TransientLinearFEOperator( (a,m),b,U,V0,constant_forms=(true,true),assembler=assembler ) diff --git a/test/PLaplacianTests.jl b/test/PLaplacianTests.jl index 2347dfc4..c64ed23f 100644 --- a/test/PLaplacianTests.jl +++ b/test/PLaplacianTests.jl @@ -8,8 +8,8 @@ using Test using SparseArrays function main(distribute,parts) - main(distribute,parts,FullyAssembledRows(),SparseMatrixCSR{0,Float64,Int}) - main(distribute,parts,SubAssembledRows(),SparseMatrixCSC{Float64,Int}) + main(distribute,parts,LocallyAssembled(),SparseMatrixCSR{0,Float64,Int}) + main(distribute,parts,Assembled(),SparseMatrixCSC{Float64,Int}) end function main(distribute,parts,strategy,local_matrix_type) diff --git a/test/parrays_update.jl b/test/parrays_update.jl index 951a68d7..ae00925e 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -21,6 +21,8 @@ dist_V = TestFESpace(dist_model,reffe) serial_ids = get_free_dof_ids(serial_V) dist_ids = get_free_dof_ids(dist_V) +dof_ids = get_free_dof_ids(dist_V) + serial_Ω = Triangulation(serial_model) serial_dΩ = Measure(serial_Ω,2) @@ -31,6 +33,55 @@ serial_a1(u,v) = ∫(u⋅v)*serial_dΩ serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V) dist_a1(u,v) = ∫(u⋅v)*dist_dΩ +assem = SparseMatrixAssembler(dist_V,dist_V) dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) all(centralize(dist_A1) - serial_A1 .< 1e-10) +function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) + K = zeros(Int32,length(I)) + for (p,(i,j)) in enumerate(zip(I,J)) + if !skip && (i < 1 || j < 1) + continue + end + K[p] = nzindex(A,i,j) + end + K +end + + + +############################################################################################ + +ids = cids +indices = partition(ids) + +n_own = own_length(cids) +ghost2global = ghost_to_global(cids) +ghost2owner = ghost_to_owner(cids) + +first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +n_global = reduce(+,n_own) +n_parts = length(ranks) +indices2 = map(ranks,n_own,first_gid,ghost2global,ghost2owner) do 
rank,n_own,start,ghost2global,ghost2owner + p = CartesianIndex((rank,)) + np = (n_parts,) + n = (n_global,) + ranges = ((1:n_own).+(start-1),) + ghost = GhostIndices(n_global,ghost2global,ghost2owner) + + PartitionedArrays.LocalIndicesWithVariableBlockSize(p,np,n,ranges,ghost) +end + +map(indices,indices2) do indices, indices2 + ghost2local = ghost_to_local(indices) + own2local = own_to_local(indices) + + n_own = own_length(indices) + n_ghost = ghost_length(indices) + + #perm = vcat(own2local,ghost2local) + perm = fill(0,local_length(indices)) + perm[own2local] .= 1:n_own + perm[ghost2local] .= (n_own+1):(n_ghost+n_own) + permute_indices(indices2,perm) +end From da06a4b4a276897863a82275b7c431a7f2beeaef Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Sun, 8 Sep 2024 00:31:31 +1000 Subject: [PATCH 05/14] Saving progress --- src/Assembly.jl | 137 ++++++++++++++++++++++++++++++++++++++++--- src/PArraysExtras.jl | 55 ++++++++++++++++- 2 files changed, 182 insertions(+), 10 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 25e2411f..10e6d976 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -84,6 +84,20 @@ local_views(a::DistributedAllocation) = a.allocs const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} +function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) + map(local_views(a),partition(axes(a,1))) do a, ids + rows = remove_ghost(unpermute(ids)) + + ghost_lids = ghost_to_local(ids) + ghost_owners = ghost_to_owner(ids) + touched_ghost_lids = filter(lid -> a.touched[lid],ghost_lids) + touched_ghost_owners = collect(ghost_owners[touched_ghost_lids]) + touched_ghost_gids = to_global!(touched_ghost_lids,ids) + ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) + return replace_ghost(rows,ghost) + end +end + # PSparseMatrix assembly chain # # 1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter @@ -104,6 +118,21 @@ function Algebra.nz_allocation(a::PSparseMatrixCounter) DistributedAllocation(allocs,a.axes,a.strategy) end +function Algebra.create_from_nz(a::PSparseMatrix) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) + A, = create_from_nz_locally_assembled(a) + return A +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) + A, = create_from_nz_assembled(a,nothing) + return A +end + # PVector assembly chain: # # 1 - nz_counter(PVectorBuilder) -> PVectorCounter @@ -142,13 +171,16 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto end function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}) - rows = _setup_prange_without_ghosts(axes(a,1)) - _rhs_callback(a,rows) + rows = remove_ghost(unpermute(axes(a,1))) + values = local_views(a) + b = rhs_callback(values,rows) + return b end function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S - rows = _setup_prange_from_pvector_allocation(a) - b = _rhs_callback(a,rows) + rows = collect_touched_ids(a) + values = map(ai -> ai.values, local_views(a)) + b = rhs_callback(values,rows) t2 = assemble!(b) if t2 !== nothing wait(t2) @@ -162,7 +194,7 @@ end # overlap communications the assembly of the matrix and the vector. # Not only it is faster, but also necessary to ensure identical ghost indices # in both the matrix and vector rows. 
-# This is done by using the following specializations chain: +# This is done by using the following specializations: function Arrays.nz_allocation( a::PSparseMatrixCounter{<:Assembled}, @@ -178,9 +210,12 @@ function Algebra.create_from_nz( b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector} ) function callback(rows) - _rhs_callback(b,rows) + new_indices = partition(rows) + values = local_views(b) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,new_indices) end - A, B = _fa_create_from_nz_with_callback(callback,a) + A, B = create_from_nz_locally_assembled(a,callback) return A, B end @@ -189,11 +224,95 @@ function Algebra.create_from_nz( b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} ) function callback(rows) - _rhs_callback(b,rows) + new_indices = collect_touched_ids(b) + values = map(ai -> ai.values, local_views(b)) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,new_indices) end function async_callback(b) assemble!(b) end - A, B = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + A, B = create_from_nz_assembled(a,callback,async_callback) return A, B end + +# Assembly methods + +function create_from_nz_locally_assembled( + a, + callback::Function = rows -> nothing +) + # Recover some data + I,J,V = get_allocations(a) + test_gids, trial_gids = axes(a) + + rows = remove_ghost(unpermute(test_gids)) + b = callback(rows) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the range for cols + cols = _setup_prange(trial_gids,J;ax=:cols) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(I,J,V,rows,cols) + return A, b +end + +function create_from_nz_assembled( + a, + callback::Function = rows -> nothing, + async_callback::Function = b -> nothing +) + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the Prange for the rows + rows = _setup_prange(test_gids,I;ax=:rows) + + # Move (I,J,V) triplets to the owner process of each row I. 
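+  # (that is, each triplet (I[k],J[k],V[k]) is sent to the part that owns the
+  # global row index I[k])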
+  # The triplets are accompanied by Jo, which holds the owner process of each column index in J
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  t = _assemble_coo!(I,J,V,rows;owners=Jo)
+
+  # Here we can overlap computations
+  # This is a good place to overlap since
+  # sending the matrix rows is a lot of data
+  if !isa(b,Nothing)
+    bprange=_setup_prange_from_pvector_allocation(b)
+    b = callback(bprange)
+  end
+
+  # Wait the transfer to finish
+  wait(t)
+
+  # Create the Prange for the cols
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+
+  # Convert again I,J to local numeration
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(a,I,J,V,rows,cols)
+
+  # Wait the transfer to finish
+  if !isa(t2,Nothing)
+    wait(t2)
+  end
+
+  return A, b
+end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index f4a5ae2e..1a1d84de 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -45,10 +45,63 @@ function permuted_variable_partition(
   end
 end
 
+"""
+    unpermute(indices::AbstractLocalIndices)
+
+Given local indices, reorders them to (locally) have own indices first,
+followed by ghost indices.
+"""
+function unpermute(indices::AbstractLocalIndices)
+  @notimplemented
+end
+
+unpermute(indices::PermutedLocalIndices) = unpermute(indices.indices)
+unpermute(indices::PartitionedArrays.LocalIndicesInBlockPartition) = indices
+unpermute(indices::OwnAndGhostIndices) = indices
+
+function unpermute(indices::LocalIndices)
+  nglobal = global_length(indices)
+  rank = part_id(indices)
+  own = OwnIndices(nglobal,rank,own_to_global(indices))
+  ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices))
+  OwnAndGhostIndices(own,ghost,global_to_owner(indices))
+end
+
+"""
+    locally_repartition(v::PVector,new_indices)
+
+Map the values of a PVector to a new partitioning of the indices.
+
+Similar to `PartitionedArrays.repartition`, but without any communications. Instead,
+it is assumed that the local-to-local mapping can be done locally.
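+
+A sketch of the intended use (illustrative; assumes `v::PVector` and derives a
+ghost-free index partition from its own axes):
+
+    new_indices = map(remove_ghost,partition(axes(v,1)))
+    w = locally_repartition(v,new_indices)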
+""" +function locally_repartition(v::PVector,new_indices) + w = similar(v,PRange(new_indices)) + locally_repartition!(w,v) +end + +function locally_repartition!(w::PVector,v::PVector) + # Fill own values + map(copy!,own_values(w),own_values(v)) + + # Fill ghost values + new_indices = partition(axes(w,1)) + old_indices = partition(axes(v,1)) + map(partition(w),partition(v),new_indices,old_indices) do w,v,new_ids,old_ids + old_gid_to_lid = global_to_local(old_ids) + for (lid,gid) in zip(ghost_to_local(new_ids),ghost_to_global(new_ids)) + old_lid = old_gid_to_lid[gid] + w[lid] = v[old_lid] + end + end + + return w +end + # Linear algebra function LinearAlgebra.axpy!(α,x::PVector,y::PVector) - @check partition(axes(x,1)) === partition(axes(y,1)) + @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1))) map(partition(x),partition(y)) do x,y LinearAlgebra.axpy!(α,x,y) end From fe547432c393ef7bbff1da8359efd72759e41376 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 17 Sep 2024 13:50:20 +1000 Subject: [PATCH 06/14] Minor --- src/Assembly.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 10e6d976..f440fd49 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -236,7 +236,7 @@ function Algebra.create_from_nz( return A, B end -# Assembly methods +# Low-level assembly methods function create_from_nz_locally_assembled( a, From 8bacdf25f109c408861c92060774dc594ef80033 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 22 Oct 2024 01:02:47 +1100 Subject: [PATCH 07/14] More fixes --- src/Assembly.jl | 43 ++++++++++++++++++++++++---------------- src/FESpaces.jl | 12 ++++------- src/GridapDistributed.jl | 4 ++-- src/PArraysExtras.jl | 30 ++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 27 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index f440fd49..1797a996 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -42,7 +42,7 @@ struct DistributedCounter{S,T,N,A,B} <: GridapType function DistributedCounter( counters :: AbstractArray{T}, axes :: NTuple{N,<:PRange}, - strategy :: Algebra.AssemblyStrategy + strategy :: AssemblyStrategy ) where {T,N} A, B, S = typeof(counters), typeof(axes), typeof(strategy) new{S,T,N,A,B}(counters,axes,strategy) @@ -52,10 +52,10 @@ end Base.axes(a::DistributedCounter) = a.axes local_views(a::DistributedCounter) = a.counters -const PVectorCounter{S,T} = DistributedCounter{S,T<:Algebra.ArrayCounter,1} +const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() -const PSparseMatrixCounter{S,T} = DistributedCounter{S,T<:Algebra.CounterCOO,2} +const PSparseMatrixCounter{S,T,A,B} = DistributedCounter{S,T,2,A,B} Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop() """ @@ -71,7 +71,7 @@ struct DistributedAllocation{S,T,N,A,B} <: GridapType function DistributedAllocation( allocs :: AbstractArray{T}, axes :: NTuple{N,<:PRange}, - strategy :: Algebra.AssemblyStrategy + strategy :: AssemblyStrategy ) where {T,N} A, B, S = typeof(allocs), typeof(axes), typeof(strategy) new{S,T,N,A,B}(allocs,axes,strategy) @@ -84,6 +84,13 @@ local_views(a::DistributedAllocation) = a.allocs const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} +function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) where S + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + function 
collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) map(local_views(a),partition(axes(a,1))) do a, ids rows = remove_ghost(unpermute(ids)) @@ -104,13 +111,13 @@ end # 2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation # 3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix -function Algebra.nz_counter(builder::PSparseMatrixBuilder,axs::NTuple{2,<:PRange}) +function Algebra.nz_counter(builder::PSparseMatrixBuilder{MT},axs::NTuple{2,<:PRange}) where MT rows, cols = axs # test ids, trial ids - counters = map(partition(rows),partition(cols)) do rows,cols - axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) - Algebra.CounterCOO{A}(axs) + counters = map(partition(rows),partition(cols)) do rows, cols + local_axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) + Algebra.CounterCOO{MT}(local_axs) end - DistributedCounter(builder.par_strategy,counters,rows,cols) + DistributedCounter(counters,axs,builder.strategy) end function Algebra.nz_allocation(a::PSparseMatrixCounter) @@ -129,7 +136,9 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) end function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) - A, = create_from_nz_assembled(a,nothing) + callback(rows) = nothing + async_callback(b) = nothing + A, = create_from_nz_assembled(a,callback,async_callback) return A end @@ -180,8 +189,8 @@ end function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S rows = collect_touched_ids(a) values = map(ai -> ai.values, local_views(a)) - b = rhs_callback(values,rows) - t2 = assemble!(b) + b = rhs_callback(values,rows) + t2 = assemble!(b) if t2 !== nothing wait(t2) end @@ -271,19 +280,19 @@ function create_from_nz_assembled( ) # Recover some data I,J,V = get_allocations(a) - test_gids = get_test_gids(a) - trial_gids = get_trial_gids(a) + test_gids, trial_gids = axes(a) # convert I and J to global dof ids to_global_indices!(I,test_gids;ax=:rows) to_global_indices!(J,trial_gids;ax=:cols) # Create the Prange for the rows - rows = _setup_prange(test_gids,I;ax=:rows) + rows = unpermute(test_gids) + # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I)) # Move (I,J,V) triplets to the owner process of each row I. 
-  # The triplets are accompanied by Jo, which holds the owner process of each column index in J
-  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  J_owners = find_owner(trial_gids,J)
+  cols = union_ghost(unpermute(trial_gids),J,J_owners)
   t = _assemble_coo!(I,J,V,rows;owners=Jo)
 
   # Here we can overlap computations
   # This is a good place to overlap since
   # sending the matrix rows is a lot of data
   if !isa(b,Nothing)
     bprange=_setup_prange_from_pvector_allocation(b)
     b = callback(bprange)
   end
 
   # Wait the transfer to finish
   wait(t)
 
   # Create the Prange for the cols
-  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
 
   # Overlap rhs communications with CSC compression
   t2 = async_callback(b)
 
   # Convert again I,J to local numeration
   to_local_indices!(I,rows;ax=:rows)
   to_local_indices!(J,cols;ax=:cols)
 
   A = _setup_matrix(a,I,J,V,rows,cols)
 
   # Wait the transfer to finish
   if !isa(t2,Nothing)
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 02530cfa..e7aabe3d 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -603,15 +603,11 @@ FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_buil
 FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
 
 function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+  map(symbolic_loop_matrix!,local_views(A),local_views(a),matdata)
 end
 
 function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+  map(numeric_loop_matrix!,local_views(A),local_views(a),matdata)
 end
 
 function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
@@ -656,7 +652,7 @@ function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrix
   create_from_nz!(mat,allocs,cache)
 end
 
-function create_from_nz!(A,a::DistributedAllocationCOO{<:Assembled},cache)
+function create_from_nz!(A,a::DistributedAllocation{<:Assembled},cache)
   _,_,V = get_allocations(a)
   psparse!(A,V,cache) |> wait
   return A
@@ -699,7 +695,7 @@ function FESpaces.SparseMatrixAssembler(
       local_strategy
     )
   end
-  mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
+  mat_builder = PSparseMatrixBuilder(local_mat_type,par_strategy)
   vec_builder = PVectorBuilder(local_vec_type,par_strategy)
   if reuse_caches
     caches = Dict{UInt64,Any}()
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index b55ddfe8..4e31dcd3 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -42,11 +42,11 @@ export with_ghost, no_ghost
 
 export redistribute
 
+include("BlockPartitionedArrays.jl")
+
 include("GridapExtras.jl")
 include("PArraysExtras.jl")
 
-include("BlockPartitionedArrays.jl")
-
 include("Algebra.jl")
 
 include("Assembly.jl")
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 1a1d84de..19c97149 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -161,6 +161,36 @@ function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
   end
 end
 
+# To local/to global for blocks
+
+function to_local_indices!(I,ids::PRange;kwargs...)
+  map(to_local!,I,partition(ids))
+end
+
+function to_global_indices!(I,ids::PRange;kwargs...)
+  map(to_global!,I,partition(ids))
+end
+
+for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
+  @eval begin
+    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
+      map($f,I,ids)
+    end
+
+    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
+      @check ax ∈ [:rows,:cols]
+      block_ids = CartesianIndices(I)
+      map(block_ids) do id
+        i = id[1]; j = id[2];
+        if ax == :rows
+          $f(I[i,j],ids[i])
+        else
+          $f(I[i,j],ids[j])
+        end
+      end
+    end
+  end
+end
+
 # This type is required because MPIArray from PArrays
 # cannot be instantiated with a NULL communicator
 struct MPIVoidVector{T} <: AbstractVector{T}
From a21abd7a6438dc60fc8eed3bd4ed853636a851e5 Mon Sep 17 00:00:00 2001
From: JordiManyer
Date: Tue, 5 Nov 2024 19:02:35 +1100
Subject: [PATCH 08/14] We can assemble again!
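
A minimal smoke test of the restored chain, as a sketch (following the
debug-mode setup of test/parrays_update.jl, where `dist_V` is a distributed
FE space and `dist_dΩ` a measure on its triangulation):

    dist_a(u,v) = ∫(u⋅v)*dist_dΩ
    assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
    A = assemble_matrix(dist_a,assem,dist_V,dist_V) # goes through create_from_nz_assembled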
--- src/Assembly.jl | 52 +++++++++++++++++++++++------------------- src/GridapExtras.jl | 13 +++++++++++ src/PArraysExtras.jl | 13 +++++++++++ test/parrays_update.jl | 14 ++++++++++++ 4 files changed, 69 insertions(+), 23 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 1797a996..e0b33745 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -50,6 +50,7 @@ struct DistributedCounter{S,T,N,A,B} <: GridapType end Base.axes(a::DistributedCounter) = a.axes +Base.axes(a::DistributedCounter,d::Integer) = a.axes[d] local_views(a::DistributedCounter) = a.counters const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} @@ -79,8 +80,18 @@ struct DistributedAllocation{S,T,N,A,B} <: GridapType end Base.axes(a::DistributedAllocation) = a.axes +Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d] local_views(a::DistributedAllocation) = a.allocs +function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N} + indices = map(partition,axes) + local_axes = map(indices...) do indices... + map(ids -> Base.OneTo(local_length(ids)), indices) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocation(allocs,axes,a.strategy) +end + const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} @@ -97,9 +108,9 @@ function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) ghost_lids = ghost_to_local(ids) ghost_owners = ghost_to_owner(ids) - touched_ghost_lids = filter(lid -> a.touched[lid],ghost_lids) + touched_ghost_lids = filter(lid -> a.touched[lid], ghost_lids) touched_ghost_owners = collect(ghost_owners[touched_ghost_lids]) - touched_ghost_gids = to_global!(touched_ghost_lids,ids) + touched_ghost_gids = to_global!(touched_ghost_lids, ids) ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) return replace_ghost(rows,ghost) end @@ -280,43 +291,38 @@ function create_from_nz_assembled( ) # Recover some data I,J,V = get_allocations(a) - test_gids, trial_gids = axes(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) - # Create the Prange for the rows - rows = unpermute(test_gids) + # Move (I,J,V) triplets to the owner process of each row I. + rows = map(unpermute,test_ids) # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I)) + t = PartitionedArrays.assemble_coo!(I,J,V,rows) - # Move (I,J,V) triplets to the owner process of each row I. 
- J_owners = find_owner(trial_gids,J) - cols = union_ghost(unpermute(trial_gids),J,J_owners) - t = _assemble_coo!(I,J,V,rows;owners=Jo) - - # Here we can overlap computations - # This is a good place to overlap since - # sending the matrix rows is a lot of data - if !isa(b,Nothing) - bprange=_setup_prange_from_pvector_allocation(b) - b = callback(bprange) - end + # Overlap CSC communication with rhs assembly + b = callback(rows) # Wait the transfer to finish wait(t) # Create the Prange for the cols - cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo) + J_owners = find_owner(trial_ids,J) + cols = map(union_ghost,map(unpermute,trial_ids),J,J_owners) # Overlap rhs communications with CSC compression t2 = async_callback(b) # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) - A = _setup_matrix(a,I,J,V,rows,cols) + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,true) # Wait the transfer to finish if !isa(t2,Nothing) diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl index df5b1186..e0723189 100644 --- a/src/GridapExtras.jl +++ b/src/GridapExtras.jl @@ -53,3 +53,16 @@ end end nothing end + +# change_axes + +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index 19c97149..be907629 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -98,6 +98,19 @@ function locally_repartition!(w::PVector,v::PVector) return w end +# SubSparseMatrix extensions + +function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) + I,J,V = findnz(A.parent) + rowmap, colmap = A.inv_indices + for k in eachindex(I) + I[k] = rowmap[I[k]] + J[k] = colmap[J[k]] + end + mask = map((i,j) -> (i > 0 && j > 0), I, J) + return I[mask], J[mask], V[mask] +end + # Linear algebra function LinearAlgebra.axpy!(α,x::PVector,y::PVector) diff --git a/test/parrays_update.jl b/test/parrays_update.jl index ae00925e..9e396812 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -37,6 +37,8 @@ assem = SparseMatrixAssembler(dist_V,dist_V) dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) all(centralize(dist_A1) - serial_A1 .< 1e-10) +centralize(dist_A1) + function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K = zeros(Int32,length(I)) for (p,(i,j)) in enumerate(zip(I,J)) @@ -48,7 +50,19 @@ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K end +Aoo = own_own_values(dist_A1).items[1] +using SparseArrays +function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) + I,J,V = findnz(A.parent) + rowmap, colmap = A.inv_indices + for k in eachindex(I) + I[k] = rowmap[I[k]] + J[k] = colmap[J[k]] + end + mask = map((i,j) -> (i > 0 && j > 0), I, J) + return I[mask], J[mask], V[mask] +end ############################################################################################ From 07cb9a4a307527682374cdbcb8080680b4f570e1 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 5 Nov 2024 19:45:40 +1100 Subject: [PATCH 09/14] Added the rest of assembly startegies --- src/Assembly.jl | 73 +++++++++++++++++++++++++------------------- src/PArraysExtras.jl | 4 +++ 2 files 
changed, 46 insertions(+), 31 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index e0b33745..a5f10c9c 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -262,34 +262,37 @@ function create_from_nz_locally_assembled( a, callback::Function = rows -> nothing ) - # Recover some data I,J,V = get_allocations(a) - test_gids, trial_gids = axes(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) - rows = remove_ghost(unpermute(test_gids)) + rows = map(remove_ghost,map(unpermute,test_ids)) b = callback(rows) - # convert I and J to global dof ids - to_global_indices!(I,test_gids;ax=:rows) - to_global_indices!(J,trial_gids;ax=:cols) + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) - # Create the range for cols - cols = _setup_prange(trial_gids,J;ax=:cols) + # TODO: replace_ghost or union_ghost? + # Actually, do we want to change the ghosts at all? We could return the unpermuted trial_ids + J_owners = find_owner(trial_ids,J) + cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) + + assembled = true + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,assembled) - A = _setup_matrix(I,J,V,rows,cols) return A, b end function create_from_nz_assembled( - a, + a, callback::Function = rows -> nothing, - async_callback::Function = b -> nothing + async_callback::Function = b -> empty_async_task ) - # Recover some data I,J,V = get_allocations(a) test_ids = partition(axes(a,1)) trial_ids = partition(axes(a,2)) @@ -298,36 +301,44 @@ function create_from_nz_assembled( map(map_local_to_global!,I,test_ids) map(map_local_to_global!,J,trial_ids) - # Move (I,J,V) triplets to the owner process of each row I. + # Overlapped COO communication and vector assembly rows = map(unpermute,test_ids) - # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I)) t = PartitionedArrays.assemble_coo!(I,J,V,rows) - - # Overlap CSC communication with rhs assembly b = callback(rows) - - # Wait the transfer to finish wait(t) - # Create the Prange for the cols - J_owners = find_owner(trial_ids,J) - cols = map(union_ghost,map(unpermute,trial_ids),J,J_owners) - # Overlap rhs communications with CSC compression t2 = async_callback(b) + J_owners = find_owner(trial_ids,J) + cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) # TODO: replace_ghost or union_ghost? 
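+  # (to my understanding, replace_ghost swaps out the previous ghost set while
+  # union_ghost merges the new gids into it; see PartitionedArrays for the exact semantics)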
- # Convert again I,J to local numeration map(map_global_to_local!,I,rows) map(map_global_to_local!,J,cols) + assembled = true a_sys = change_axes(a,(PRange(rows),PRange(cols))) values = map(create_from_nz,local_views(a_sys)) - A = PSparseMatrix(values,rows,cols,true) + A = PSparseMatrix(values,rows,cols,assembled) - # Wait the transfer to finish - if !isa(t2,Nothing) - wait(t2) - end + wait(t2) + return A, b +end + +function create_from_nz_subassembled( + a, + callback::Function = rows -> nothing, + async_callback::Function = b -> empty_async_task +) + rows = partition(axes(a,1)) + cols = partition(axes(a,2)) + + b = callback(rows) + t2 = async_callback(b) + + assembled = false + values = map(create_from_nz,local_views(a)) + A = PSparseMatrix(values,rows,cols,assembled) + wait(t2) return A, b end diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index be907629..4d87e2b0 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -111,6 +111,10 @@ function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) return I[mask], J[mask], V[mask] end +# Async tasks + +const empty_async_task = PartitionedArrays.FakeTask(x -> nothing) + # Linear algebra function LinearAlgebra.axpy!(α,x::PVector,y::PVector) From a820d9a6979b68f5808e784e0816530c430b1c45 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 5 Nov 2024 23:31:50 +1100 Subject: [PATCH 10/14] Bugfixes --- src/Assembly.jl | 9 +++---- src/PArraysExtras.jl | 53 +++++++++++++++++++++++++++++++++++++++++- test/parrays_update.jl | 17 ++------------ 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index a5f10c9c..021e8151 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -147,9 +147,7 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) end function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) - callback(rows) = nothing - async_callback(b) = nothing - A, = create_from_nz_assembled(a,callback,async_callback) + A, = create_from_nz_assembled(a) return A end @@ -302,15 +300,14 @@ function create_from_nz_assembled( map(map_local_to_global!,J,trial_ids) # Overlapped COO communication and vector assembly - rows = map(unpermute,test_ids) + rows = filter_and_replace_ghost(map(unpermute,test_ids),I) t = PartitionedArrays.assemble_coo!(I,J,V,rows) b = callback(rows) wait(t) # Overlap rhs communications with CSC compression t2 = async_callback(b) - J_owners = find_owner(trial_ids,J) - cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) # TODO: replace_ghost or union_ghost? + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) map(map_global_to_local!,I,rows) map(map_global_to_local!,J,cols) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index 4d87e2b0..05c82163 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -98,6 +98,57 @@ function locally_repartition!(w::PVector,v::PVector) return w end +""" + filter_and_replace_ghost(indices,gids) + +Replace ghost ids in `indices` with the ghost ids within `gids`. +""" +function filter_and_replace_ghost(indices,gids) + owners = find_owner(indices,gids) + new_indices = map(indices,gids,owners) do indices, gids, owners + ghost_gids, ghost_owners = _filter_ghost(indices,gids,owners) + replace_ghost(indices, ghost_gids, ghost_owners) + end + return new_indices +end + +# Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that +# belong to `indices`. 
+function _filter_ghost(indices,gids,owners) + ghosts = Set{Int}() + part_owner = part_id(indices) + + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + push!(ghosts,gid) + end + end + + new_ghost_to_global = zeros(Int,n_ghost) + new_ghost_to_owner = zeros(Int32,n_ghost) + + empty!(ghosts) + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + new_ghost_to_global[n_ghost] = gid + new_ghost_to_owner[n_ghost] = owner + push!(ghosts,gid) + end + end + + return new_ghost_to_global, new_ghost_to_owner +end + # SubSparseMatrix extensions function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) @@ -113,7 +164,7 @@ end # Async tasks -const empty_async_task = PartitionedArrays.FakeTask(x -> nothing) +const empty_async_task = PartitionedArrays.FakeTask(() -> nothing) # Linear algebra diff --git a/test/parrays_update.jl b/test/parrays_update.jl index 9e396812..fd62332b 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -37,7 +37,8 @@ assem = SparseMatrixAssembler(dist_V,dist_V) dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V) all(centralize(dist_A1) - serial_A1 .< 1e-10) -centralize(dist_A1) + +############################################################################################ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K = zeros(Int32,length(I)) @@ -50,20 +51,6 @@ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) K end -Aoo = own_own_values(dist_A1).items[1] - -using SparseArrays -function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) - I,J,V = findnz(A.parent) - rowmap, colmap = A.inv_indices - for k in eachindex(I) - I[k] = rowmap[I[k]] - J[k] = colmap[J[k]] - end - mask = map((i,j) -> (i > 0 && j > 0), I, J) - return I[mask], J[mask], V[mask] -end - ############################################################################################ ids = cids From 7b3a94fb03b27034a803055721979860871ebc5d Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 5 Nov 2024 23:39:15 +1100 Subject: [PATCH 11/14] MInor --- src/Assembly.jl | 5 +---- src/PArraysExtras.jl | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 021e8151..a538eaa1 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -270,10 +270,7 @@ function create_from_nz_locally_assembled( map(map_local_to_global!,I,test_ids) map(map_local_to_global!,J,trial_ids) - # TODO: replace_ghost or union_ghost? - # Actually, do we want to change the ghosts at all? We could return the unpermuted trial_ids - J_owners = find_owner(trial_ids,J) - cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) map(map_global_to_local!,I,rows) map(map_global_to_local!,J,cols) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index 05c82163..b139a547 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -102,6 +102,10 @@ end filter_and_replace_ghost(indices,gids) Replace ghost ids in `indices` with the ghost ids within `gids`. + +NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether +they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we +would need. TODO: Make me better. 
""" function filter_and_replace_ghost(indices,gids) owners = find_owner(indices,gids) @@ -113,7 +117,7 @@ function filter_and_replace_ghost(indices,gids) end # Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that -# belong to `indices`. +# belong to `indices`. This could eventually be a flag in the original function. function _filter_ghost(indices,gids,owners) ghosts = Set{Int}() part_owner = part_id(indices) From 9c5b9682e8ac0d006815b2b995f41c7cfca9f4b1 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Wed, 26 Mar 2025 12:48:56 +1100 Subject: [PATCH 12/14] Saving changes --- src/Algebra.jl | 78 +++++++++++++++++- src/Assembly.jl | 174 +++++++++++++++++++++++++++++++++++------ src/FESpaces.jl | 148 ----------------------------------- src/PArraysExtras.jl | 20 ++++- test/parrays_update.jl | 46 +++++++++-- 5 files changed, 286 insertions(+), 180 deletions(-) diff --git a/src/Algebra.jl b/src/Algebra.jl index 3fa0b502..64410c95 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -68,9 +68,11 @@ end # change_ghost function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T - same_partition = (a.index_partition === partition(ids)) - a_new = same_partition ? a : change_ghost(T,a,ids) - if make_consistent && (!same_partition || !is_consistent) + if (a.index_partition === partition(ids)) + return a + end + a_new = change_ghost(T,a,ids) + if make_consistent && !is_consistent consistent!(a_new) |> wait end return a_new @@ -98,3 +100,73 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_ end return BlockPVector(vals,ids) end + +# This type is required in order to be able to access the local portion +# of distributed sparse matrices and vectors using local indices from the +# distributed test and trial spaces +struct LocalView{T,N,A} <:AbstractArray{T,N} + plids_to_value::A + d_to_lid_to_plid::NTuple{N,Vector{Int32}} + local_size::NTuple{N,Int} + function LocalView( + plids_to_value::AbstractArray{T,N}, d_to_lid_to_plid::NTuple{N,Vector{Int32}} + ) where {T,N} + A = typeof(plids_to_value) + local_size = map(length,d_to_lid_to_plid) + new{T,N,A}(plids_to_value,d_to_lid_to_plid,local_size) + end +end + +Base.size(a::LocalView) = a.local_size +Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() +function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} + plids = map(getindex,a.d_to_lid_to_plid,lids) + if all(i->i>0,plids) + a.plids_to_value[plids...] + else + zero(T) + end +end +function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} + plids = map(getindex,a.d_to_lid_to_plid,lids) + @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" + a.plids_to_value[plids...] 
= v +end + +function local_views(a::PVector,new_rows::PRange) + old_rows = axes(a,1) + if partition(old_rows) === partition(new_rows) + partition(a) + else + map(partition(a),partition(old_rows),partition(new_rows)) do a,old_rows,new_rows + LocalView(a,(local_to_local_map(new_rows,old_rows),)) + end + end +end + +function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) + old_rows, old_cols = axes(a) + if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) + partition(a) + else + map( + partition(a),partition(old_rows),partition(old_cols),partition(new_rows),partition(new_cols) + ) do a,old_rows,old_cols,new_rows,new_cols + rl2lmap = local_to_local_map(new_rows,old_rows) + cl2lmap = local_to_local_map(new_cols,old_cols) + LocalView(a,(rl2lmap,cl2lmap)) + end + end +end + +function local_views(a::BlockPVector,new_rows::BlockPRange) + vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays + return map(mortar,vals) +end + +function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) + vals = map(CartesianIndices(blocksize(a))) do I + local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) + end |> to_parray_of_arrays + return map(mortar,vals) +end diff --git a/src/Assembly.jl b/src/Assembly.jl index a538eaa1..165d41c4 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -5,23 +5,36 @@ struct Assembled <: FESpaces.AssemblyStrategy end struct SubAssembled <: FESpaces.AssemblyStrategy end struct LocallyAssembled <: FESpaces.AssemblyStrategy end +function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols) + DefaultAssemblyStrategy() +end + +# When LocallyAssembling, make sure that you also loop over ghost cells. +function local_assembly_strategy(::LocallyAssembled,rows,cols) + rows_local_to_ghost = local_to_ghost(rows) + GenericAssemblyStrategy( + identity, + identity, + row->iszero(rows_local_to_ghost[row]), + col->true + ) +end + # PSparseMatrix and PVector builders -struct PSparseMatrixBuilder{T,B} - local_matrix_type::Type{T} +struct DistributedArrayBuilder{T,B} + local_array_type::Type{T} strategy::B end +const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T<:AbstractMatrix,B} +const PVectorBuilder{T,B} = DistributedArrayBuilder{T<:AbstractVector,B} + function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv T = eltype(Tv) return PSparseMatrix{T} end -struct PVectorBuilder{T,B} - local_vector_type::Type{T} - strategy::B -end - function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv T = eltype(Tv) return PVector{T} @@ -53,6 +66,11 @@ Base.axes(a::DistributedCounter) = a.axes Base.axes(a::DistributedCounter,d::Integer) = a.axes[d] local_views(a::DistributedCounter) = a.counters +function local_views(a::DistributedCounter,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.counters +end + const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() @@ -83,6 +101,11 @@ Base.axes(a::DistributedAllocation) = a.axes Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d] local_views(a::DistributedAllocation) = a.allocs +function local_views(a::DistributedAllocation,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.allocs +end + function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N} indices = map(partition,axes) local_axes = map(indices...) do indices... 
@@ -151,6 +174,11 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) return A end +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:SubAssembled}) + A, = create_from_nz_subassembled(a) + return A +end + # PVector assembly chain: # # 1 - nz_counter(PVectorBuilder) -> PVectorCounter @@ -166,7 +194,7 @@ function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) wh DistributedCounter(counters,axs,builder.strategy) end -function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled}) +function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}}) allocs = map(nz_allocation,local_views(a)) DistributedAllocation(allocs,a.axes,a.strategy) end @@ -188,14 +216,14 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto @assert false end -function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}) +function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}}) rows = remove_ghost(unpermute(axes(a,1))) values = local_views(a) b = rhs_callback(values,rows) return b end -function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled}) rows = collect_touched_ids(a) values = map(ai -> ai.values, local_views(a)) b = rhs_callback(values,rows) @@ -215,23 +243,19 @@ end # This is done by using the following specializations: function Arrays.nz_allocation( - a::PSparseMatrixCounter{<:Assembled}, - b::PVectorCounter{<:Assembled} + a::PSparseMatrixCounter, b::PVectorCounter ) - A = nz_allocation(a) # PSparseMatrixAllocation - B = nz_allocation(b) # PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} - return A, B + return nz_allocation(a), nz_allocation(b) end function Algebra.create_from_nz( a::PSparseMatrixAllocation{<:LocallyAssembled}, - b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector} + b::PVectorAllocation{<:LocallyAssembled} ) function callback(rows) - new_indices = partition(rows) values = local_views(b) b_fespace = PVector(values,partition(axes(b,1))) - locally_repartition(b_fespace,new_indices) + locally_repartition(b_fespace,rows) end A, B = create_from_nz_locally_assembled(a,callback) return A, B @@ -239,7 +263,7 @@ end function Algebra.create_from_nz( a::PSparseMatrixAllocation{<:Assembled}, - b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation} + b::PVectorAllocation{<:Assembled} ) function callback(rows) new_indices = collect_touched_ids(b) @@ -254,6 +278,19 @@ function Algebra.create_from_nz( return A, B end +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:SubAssembled}, + b::PVectorAllocation{<:SubAssembled} +) + function callback(rows) + @check matching_local_indices(rows,axes(b,1)) + values = local_views(b) + PVector(values,partition(axes(b,1))) + end + A, B = create_from_nz_subassembled(a,callback) + return A, B +end + # Low-level assembly methods function create_from_nz_locally_assembled( @@ -321,18 +358,111 @@ end function create_from_nz_subassembled( a, callback::Function = rows -> nothing, - async_callback::Function = b -> empty_async_task ) rows = partition(axes(a,1)) cols = partition(axes(a,2)) b = callback(rows) - t2 = async_callback(b) assembled = false values = map(create_from_nz,local_views(a)) A = PSparseMatrix(values,rows,cols,assembled) - wait(t2) return A, b end + +# DistributedSparseMatrixAssemblers + +""" +""" +struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: 
SparseMatrixAssembler + strategy::A + assems::B + matrix_builder::C + vector_builder::D + test_gids::E + trial_gids::F +end + +local_views(a::DistributedSparseMatrixAssembler) = a.assems + +FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids +FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids +FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder +FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder +FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy + +function FESpaces.SparseMatrixAssembler( + local_mat_type, + local_vec_type, + rows::PRange, + cols::PRange, + par_strategy=Assembled() +) + assems = map(partition(rows),partition(cols)) do rows,cols + FESpaces.GenericSparseMatrixAssembler( + SparseMatrixBuilder(local_mat_type), + ArrayBuilder(local_vec_type), + Base.OneTo(length(rows)), + Base.OneTo(length(cols)), + local_assembly_strategy(par_strategy,rows,cols) + ) + end + mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy) + vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy) + return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols) +end + +function FESpaces.SparseMatrixAssembler( + local_mat_type, + local_vec_type, + trial::DistributedFESpace, + test::DistributedFESpace, + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... +) + rows = get_free_dof_ids(test) + cols = get_free_dof_ids(trial) + SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) +end + +function FESpaces.SparseMatrixAssembler( + trial::DistributedFESpace, + test::DistributedFESpace, + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... +) + Tv = getany(map(get_vector_type,local_views(trial))) + Tm = SparseMatrixCSC{eltype(Tv),Int} + SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
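+  # (so by default we assemble into CSC local matrices whose scalar type matches
+  # the trial space's vector type)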
+end + +function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 1fb12343..4724789b 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -582,154 +582,6 @@ function FESpaces.collect_cell_matrix_and_vector( end end -""" -""" -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F,G} <: SparseMatrixAssembler - strategy::A - assems::B - matrix_builder::C - vector_builder::D - test_gids::E - trial_gids::F - caches::G -end - -local_views(a::DistributedSparseMatrixAssembler) = a.assems - -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids -FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder -FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder -FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy - -function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - map(symbolic_loop_matrix!,local_views(A),local_views(a),matdata) -end - -function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - map(numeric_loop_matrix!,local_views(A),local_views(a),matdata) -end - -function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - Aviews=local_views(A,rows,cols) - bviews=local_views(b,rows) - aviews=local_views(a) - map(symbolic_loop_matrix_and_vector!,Aviews,bviews,aviews,data) -end - -function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end - -function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrixAssembler,matdata) - @check !isnothing(a.caches) "Assembler 
is not reusable!" - @check haskey(a.caches,objectid(mat)) "Cache not found!" - cache = a.caches[objectid(mat)] - - LinearAlgebra.fillstored!(mat,zero(eltype(mat))) - - counter = nz_counter(get_matrix_builder(a),(get_rows(a),get_cols(a))) - map(local_views(counter),partition(mat)) do counter, mat - counter.nnz = nnz(mat) - end - allocs = nz_allocation(counter) - numeric_loop_matrix!(allocs,a,matdata) - - create_from_nz!(mat,allocs,cache) -end - -function create_from_nz!(A,a::DistributedAllocation{<:Assembled},cache) - _,_,V = get_allocations(a) - psparse!(A,V,cache) |> wait - return A -end - -# Parallel Assembly strategies - -function local_assembly_strategy(::Assembled,rows,cols) - DefaultAssemblyStrategy() -end - -# When using this one, make sure that you also loop over ghost cells. -# This is at your own risk. -function local_assembly_strategy(::LocallyAssembled,rows,cols) - rows_local_to_ghost = local_to_ghost(rows) - GenericAssemblyStrategy( - identity, - identity, - row->iszero(rows_local_to_ghost[row]), - col->true - ) -end - -# Assembler high level constructors -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - rows::PRange, - cols::PRange, - par_strategy=Assembled(); - reuse_caches=false -) - assems = map(partition(rows),partition(cols)) do rows,cols - local_strategy = local_assembly_strategy(par_strategy,rows,cols) - FESpaces.GenericSparseMatrixAssembler( - SparseMatrixBuilder(local_mat_type), - ArrayBuilder(local_vec_type), - Base.OneTo(length(rows)), - Base.OneTo(length(cols)), - local_strategy - ) - end - mat_builder = PSparseMatrixBuilder(local_mat_type,par_strategy) - vec_builder = PVectorBuilder(local_vec_type,par_strategy) - if reuse_caches - caches = Dict{UInt64,Any}() - else - caches = nothing - end - return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols,caches) -end - -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - rows = get_free_dof_ids(test) - cols = get_free_dof_ids(trial) - SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) -end - -function FESpaces.SparseMatrixAssembler( - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial))) - T = eltype(Tv) - Tm = SparseMatrixCSC{T,Int} - SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) -end - # ZeroMean FESpace struct DistributedZeroMeanCache{A,B} dvol::A diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl index b139a547..df09bdb0 100644 --- a/src/PArraysExtras.jl +++ b/src/PArraysExtras.jl @@ -153,6 +153,25 @@ function _filter_ghost(indices,gids,owners) return new_ghost_to_global, new_ghost_to_owner end +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. 
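+# For example (illustrative): if a's local gids are (3,7,9) and b's are (7,9,11),
+# local_to_local_map(a,b) returns Int32[0,1,2] -- gid 3 has no counterpart in b,
+# while gids 7 and 9 live at b's local positions 1 and 2.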
+# Note that the iszero check is necessary because, in the general case,
+# there might be gids in b which are not present in a.
+function local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
+  a_lid_to_b_lid = fill(zero(Int32),local_length(a))
+
+  b_local_to_global = local_to_global(b)
+  a_global_to_local = global_to_local(a)
+  for b_lid in 1:local_length(b)
+    gid = b_local_to_global[b_lid]
+    a_lid = a_global_to_local[gid]
+    if !iszero(a_lid)
+      a_lid_to_b_lid[a_lid] = b_lid
+    end
+  end
+  a_lid_to_b_lid
+end
+
 # SubSparseMatrix extensions
 
 function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
@@ -362,4 +381,3 @@ end
 function generate_subparts(parts::DebugArray,new_comm_size)
   DebugArray(LinearIndices((new_comm_size,)))
 end
-
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index fd62332b..8516150e 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -3,6 +3,8 @@ using Gridap
 using PartitionedArrays
 using GridapDistributed
 
+using Gridap.FESpaces, Gridap.Algebra
+
 np = (1,2)
 ranks = with_debug() do distribute
   distribute(LinearIndices((prod(np),)))
@@ -29,14 +31,46 @@ serial_dΩ = Measure(serial_Ω,2)
 dist_Ω = Triangulation(dist_model)
 dist_dΩ = Measure(dist_Ω,2)
 
-serial_a1(u,v) = ∫(u⋅v)*serial_dΩ
-serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V)
+dist_Ωg = Triangulation(GridapDistributed.with_ghost,dist_model)
+dist_dΩg = Measure(dist_Ωg,2)
+
+serial_a(u,v) = ∫(u⋅v)*serial_dΩ
+dist_a(u,v) = ∫(u⋅v)*dist_dΩ
+dist_ag(u,v) = ∫(u⋅v)*dist_dΩg
+
+serial_A = assemble_matrix(serial_a,serial_V,serial_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
+dist_A_AS = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.LocallyAssembled())
+dist_A_LA = assemble_matrix(dist_ag,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.SubAssembled())
+dist_A_SA = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+all(abs.(centralize(dist_A_AS) - serial_A) .< 1e-10)
+
+x_AS = prand(partition(axes(dist_A_AS,2)))
+x_LA = GridapDistributed.change_ghost(x_AS,axes(dist_A_LA,2))
+x_SA = GridapDistributed.change_ghost(x_AS,axes(dist_A_SA,2))
+
+norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+
+############################################################################################
+
+A = deepcopy(dist_A_AS)
+
+t = assemble!(A;reuse=true)
+_, cache = fetch(t)
 
-dist_a1(u,v) = ∫(u⋅v)*dist_dΩ
-assem = SparseMatrixAssembler(dist_V,dist_V)
-dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
+t2 = assemble!(A,cache)
+wait(t2)
 
-all(centralize(dist_A1) - serial_A1 .< 1e-10)
+map(==, partition(A), partition(dist_A_AS))
 
 ############################################################################################
 
From eef271e37b795fba23f11f4a9b64849f7c880319 Mon Sep 17 00:00:00 2001
From: JordiManyer
Date: Mon, 31 Mar 2025 07:53:26 +1100
Subject: [PATCH 13/14] saving changes

---
 src/Assembly.jl        | 114 +++----------------
 src/FESpaces.jl        | 248 ++++++++++++++++++++++++++++--------------
 test/parrays_update.jl |  59 +---------
 3 files changed, 185 insertions(+), 236 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 165d41c4..1e0127bd 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -22,16 +22,24 @@ end
 
 # PSparseMatrix and PVector builders
 
-struct DistributedArrayBuilder{T,B}
+struct DistributedArrayBuilder{T,N,B}
   local_array_type::Type{T}
   strategy::B
+  function 
DistributedArrayBuilder( + local_array_type::Type{T}, + strategy::AssemblyStrategy + ) where T + N = ndims(T) + B = typeof(strategy) + new{T,N,B}(local_array_type,strategy) + end end -const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T<:AbstractMatrix,B} -const PVectorBuilder{T,B} = DistributedArrayBuilder{T<:AbstractVector,B} +const PVectorBuilder{T,B} = DistributedArrayBuilder{T,1,B} +const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T,2,B} -function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv - T = eltype(Tv) +function Algebra.get_array_type(::PSparseMatrixBuilder{Tm}) where Tm + T = eltype(Tm) return PSparseMatrix{T} end @@ -370,99 +378,3 @@ function create_from_nz_subassembled( return A, b end - -# DistributedSparseMatrixAssemblers - -""" -""" -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler - strategy::A - assems::B - matrix_builder::C - vector_builder::D - test_gids::E - trial_gids::F -end - -local_views(a::DistributedSparseMatrixAssembler) = a.assems - -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids -FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder -FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder -FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy - -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - rows::PRange, - cols::PRange, - par_strategy=Assembled() -) - assems = map(partition(rows),partition(cols)) do rows,cols - FESpaces.GenericSparseMatrixAssembler( - SparseMatrixBuilder(local_mat_type), - ArrayBuilder(local_vec_type), - Base.OneTo(length(rows)), - Base.OneTo(length(cols)), - local_assembly_strategy(par_strategy,rows,cols) - ) - end - mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy) - vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy) - return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols) -end - -function FESpaces.SparseMatrixAssembler( - local_mat_type, - local_vec_type, - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - rows = get_free_dof_ids(test) - cols = get_free_dof_ids(trial) - SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) -end - -function FESpaces.SparseMatrixAssembler( - trial::DistributedFESpace, - test::DistributedFESpace, - par_strategy::FESpaces.AssemblyStrategy=Assembled(); - kwargs... -) - Tv = getany(map(get_vector_type,local_views(trial))) - Tm = SparseMatrixCSC{eltype(Tv),Int} - SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
-end - -function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows, cols = get_rows(a), get_cols(a) - map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows, cols = get_rows(a), get_cols(a) - map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows, cols = get_rows(a), get_cols(a) - map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end - -function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows, cols = get_rows(a), get_cols(a) - map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 4724789b..3121e118 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -505,83 +505,6 @@ function _add_distributed_constraint(F::DistributedFESpace,order::Integer,constr V end -# Assembly - -function FESpaces.collect_cell_matrix( - trial::DistributedFESpace, - test::DistributedFESpace, - a::DistributedDomainContribution -) - map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace, a::DistributedDomainContribution -) - map(collect_cell_vector,local_views(test),local_views(a)) -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution -) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform) - ) -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution, - uhd -) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform), - local_views(uhd) - ) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace,l::Number -) - map(local_views(test)) do s - collect_cell_vector(s,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number -) - map(local_views(trial),local_views(test),local_views(mat)) do u,v,m - collect_cell_matrix_and_vector(u,v,m,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number, - uhd -) - map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f - collect_cell_matrix_and_vector(u,v,m,l,f) - end -end - # ZeroMean FESpace struct DistributedZeroMeanCache{A,B} dvol::A @@ -741,3 +664,174 @@ function FESpaces.ConstantFESpace( vector_type = _find_vector_type(spaces,gids) return 
DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
 end
+
+# Assembly
+
+"""
+    DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+
+Assembler for distributed FE systems. It gathers one local assembler per part,
+the builders for the global `PSparseMatrix`/`PVector`, the test/trial dof index
+partitions, and the parallel assembly strategy.
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type),
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  rows = get_free_dof_ids(test)
+  cols = get_free_dof_ids(trial)
+  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  Tv = getany(map(get_vector_type,local_views(trial)))
+  Tm = SparseMatrixCSC{eltype(Tv),Int}
+  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) 
+end + +function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.collect_cell_matrix( + trial::DistributedFESpace, + test::DistributedFESpace, + a::DistributedDomainContribution +) + map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace, a::DistributedDomainContribution +) + map(collect_cell_vector,local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform) + ) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution, + uhd +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform), + local_views(uhd) + ) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace,l::Number +) + map(local_views(test)) do s + collect_cell_vector(s,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number +) + map(local_views(trial),local_views(test),local_views(mat)) do u,v,m + collect_cell_matrix_and_vector(u,v,m,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number, + uhd +) + map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f + collect_cell_matrix_and_vector(u,v,m,l,f) + end +end diff --git a/test/parrays_update.jl b/test/parrays_update.jl index 8516150e..961cbfcd 100644 --- a/test/parrays_update.jl +++ b/test/parrays_update.jl @@ -59,64 +59,7 @@ norm(dist_A_AS*x_AS - dist_A_LA*x_LA) norm(dist_A_AS*x_AS - dist_A_SA*x_SA) assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V) +norm(dist_A_AS*x_AS - dist_A_SA*x_SA) 
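+# (the norm above checks that the in-place reassembly reproduces the same operator)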
############################################################################################ -A = deepcopy(dist_A_AS) - -t = assemble!(A;reuse=true) -_, cache = fetch(t) - -t2 = assemble!(A,cache) -wait(t2) - -map(==, partition(A), partition(dist_A_AS)) - -############################################################################################ - -function PartitionedArrays.precompute_nzindex(A,I,J;skip=false) - K = zeros(Int32,length(I)) - for (p,(i,j)) in enumerate(zip(I,J)) - if !skip && (i < 1 || j < 1) - continue - end - K[p] = nzindex(A,i,j) - end - K -end - -############################################################################################ - -ids = cids -indices = partition(ids) - -n_own = own_length(cids) -ghost2global = ghost_to_global(cids) -ghost2owner = ghost_to_owner(cids) - -first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) -n_global = reduce(+,n_own) -n_parts = length(ranks) -indices2 = map(ranks,n_own,first_gid,ghost2global,ghost2owner) do rank,n_own,start,ghost2global,ghost2owner - p = CartesianIndex((rank,)) - np = (n_parts,) - n = (n_global,) - ranges = ((1:n_own).+(start-1),) - ghost = GhostIndices(n_global,ghost2global,ghost2owner) - - PartitionedArrays.LocalIndicesWithVariableBlockSize(p,np,n,ranges,ghost) -end - -map(indices,indices2) do indices, indices2 - ghost2local = ghost_to_local(indices) - own2local = own_to_local(indices) - - n_own = own_length(indices) - n_ghost = ghost_length(indices) - - #perm = vcat(own2local,ghost2local) - perm = fill(0,local_length(indices)) - perm[own2local] .= 1:n_own - perm[ghost2local] .= (n_own+1):(n_ghost+n_own) - permute_indices(indices2,perm) -end From cb9de8ec348a09d14670b606e5dda3918177c71e Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 31 Mar 2025 10:13:45 +1100 Subject: [PATCH 14/14] Bugfixes --- src/Assembly.jl | 19 ++++++++++--------- src/FESpaces.jl | 3 +-- src/PArraysExtras.jl | 19 ++++++++++++++++++- test/PeriodicBCsTests.jl | 2 ++ test/parrays_update.jl | 1 - 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/Assembly.jl b/src/Assembly.jl index 1e0127bd..41673c5e 100644 --- a/src/Assembly.jl +++ b/src/Assembly.jl @@ -133,18 +133,19 @@ function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) return I,J,V end -function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation}) - map(local_views(a),partition(axes(a,1))) do a, ids +function collect_touched_ids(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S + touched_ids = map(local_views(a),partition(axes(a,1))) do a, ids + n_global = global_length(ids) rows = remove_ghost(unpermute(ids)) ghost_lids = ghost_to_local(ids) - ghost_owners = ghost_to_owner(ids) - touched_ghost_lids = filter(lid -> a.touched[lid], ghost_lids) - touched_ghost_owners = collect(ghost_owners[touched_ghost_lids]) + touched_ghost_lids = collect(Int,filter(lid -> a.touched[lid], ghost_lids)) + touched_ghost_owners = local_to_owner(ids)[touched_ghost_lids] touched_ghost_gids = to_global!(touched_ghost_lids, ids) ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) - return replace_ghost(rows,ghost) + replace_ghost(rows,ghost) end + return touched_ids end # PSparseMatrix assembly chain @@ -193,13 +194,13 @@ end # 2 - nz_allocation(PVectorCounter) -> PVectorAllocation # 3 - create_from_nz(PVectorAllocation) -> PVector -function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) where VT +function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::NTuple{1,<:PRange}) where 
VT
   rows, = axs
   counters = map(partition(rows)) do rows
     axs = (Base.OneTo(local_length(rows)),)
     nz_counter(ArrayBuilder(VT),axs)
   end
-  DistributedCounter(counters,axs,builder.strategy)
+  DistributedCounter(counters,(rows,),builder.strategy)
 end
 
 function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}})
@@ -291,7 +292,7 @@ function Algebra.create_from_nz(
   b::PVectorAllocation{<:SubAssembled}
 )
   function callback(rows)
-    @check matching_local_indices(rows,axes(b,1))
+    @check PArrays.matching_local_indices(PRange(rows),axes(b,1))
     values = local_views(b)
     PVector(values,partition(axes(b,1)))
   end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 3121e118..f68b7470 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -168,8 +168,7 @@ function generate_gids(
       cell_owner = cell_local_to_owner[cell]
       p = cell_to_gdofs.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
-        dof_owner = ldof_to_owner[ldof]
-        if ldof > 0 && (dof_owner == cell_owner)
+        if ldof > 0 && isequal(ldof_to_owner[ldof],cell_owner)
           ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
         end
       end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index df09bdb0..302944a6 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -3,7 +3,7 @@
     permuted_variable_partition(n_own,gids,owners; n_global, start)
 
 Create indices which are a permuted version of a variable_partition.
-The advantage of this wrt the `LocalIndices` type, is that we can compute
+The advantage of this w.r.t. the `LocalIndices` type is that we can compute
 dof owners with minimal communications, since we only need the size of the blocks.
 
 NOTE: This is the type for our FESpace dof_ids.
@@ -153,6 +153,20 @@ function _filter_ghost(indices,gids,owners)
   return new_ghost_to_global, new_ghost_to_owner
 end
 
+# function PArrays.remove_ghost(indices::PermutedLocalIndices)
+#   n_global = global_length(indices)
+#   own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
+#   ghost = GhostIndices(n_global,Int[],Int32[])
+#   OwnAndGhostIndices(own,ghost)
+# end
+
+# NOTE: We assume the permutation keeps the owned indices in their original
+# relative order, so it suffices to strip the ghosts from the underlying
+# (unpermuted) indices.
+function PArrays.remove_ghost(indices::PermutedLocalIndices)
+  remove_ghost(indices.indices)
+end
+
 # This function computes a mapping among the local identifiers of a and b
 # for which the corresponding global identifiers are both in a and b. 
diff --git a/test/PeriodicBCsTests.jl b/test/PeriodicBCsTests.jl
index 6177752d..247cb6fc 100644
--- a/test/PeriodicBCsTests.jl
+++ b/test/PeriodicBCsTests.jl
@@ -35,4 +35,6 @@ function main(distribute,parts)
 
 end
 
+main(DebugArray,(2,2))
+
 end # module
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index 961cbfcd..ec106730 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -62,4 +62,3 @@ assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
 norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
 
 ############################################################################################
-
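############################################################################################

# A condensed, self-contained sketch of the assembly API exercised in
# test/parrays_update.jl above (debug mode, 2 parts; the mesh, FE space and
# integrand are illustrative choices).

using Gridap
using PartitionedArrays
using GridapDistributed

np = (1,2)
ranks = with_debug() do distribute
  distribute(LinearIndices((prod(np),)))
end

model = CartesianDiscreteModel(ranks,np,(0,1,0,1),(8,8))
V = FESpace(model,ReferenceFE(lagrangian,Float64,1))
dΩ = Measure(Triangulation(model),2)
a(u,v) = ∫(u⋅v)*dΩ

# Build an assembler with an explicit parallel strategy, assemble once, then
# reassemble in place reusing the sparsity pattern of A.
assem = SparseMatrixAssembler(V,V,GridapDistributed.Assembled())
A = assemble_matrix(a,assem,V,V)
assemble_matrix!(a,A,assem,V,V)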