From 37ac5c8121bab5f0127134113603371b9c6a5b96 Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Mon, 14 Oct 2019 16:34:14 -0400 Subject: [PATCH 1/7] Solo: a shared memory collective module --- ompi/mca/coll/solo/Makefile.am | 43 +++ ompi/mca/coll/solo/coll_solo.h | 190 ++++++++++++++ ompi/mca/coll/solo/coll_solo_allreduce.c | 274 +++++++++++++++++++ ompi/mca/coll/solo/coll_solo_barrier.c | 37 +++ ompi/mca/coll/solo/coll_solo_bcast.c | 147 +++++++++++ ompi/mca/coll/solo/coll_solo_component.c | 148 +++++++++++ ompi/mca/coll/solo/coll_solo_module.c | 320 +++++++++++++++++++++++ ompi/mca/coll/solo/coll_solo_mpool.c | 233 +++++++++++++++++ ompi/mca/coll/solo/coll_solo_mpool.h | 96 +++++++ ompi/mca/coll/solo/coll_solo_reduce.c | 279 ++++++++++++++++++++ 10 files changed, 1767 insertions(+) create mode 100644 ompi/mca/coll/solo/Makefile.am create mode 100644 ompi/mca/coll/solo/coll_solo.h create mode 100644 ompi/mca/coll/solo/coll_solo_allreduce.c create mode 100644 ompi/mca/coll/solo/coll_solo_barrier.c create mode 100644 ompi/mca/coll/solo/coll_solo_bcast.c create mode 100644 ompi/mca/coll/solo/coll_solo_component.c create mode 100644 ompi/mca/coll/solo/coll_solo_module.c create mode 100644 ompi/mca/coll/solo/coll_solo_mpool.c create mode 100644 ompi/mca/coll/solo/coll_solo_mpool.h create mode 100644 ompi/mca/coll/solo/coll_solo_reduce.c diff --git a/ompi/mca/coll/solo/Makefile.am b/ompi/mca/coll/solo/Makefile.am new file mode 100644 index 00000000000..36f095efa86 --- /dev/null +++ b/ompi/mca/coll/solo/Makefile.am @@ -0,0 +1,43 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + coll_solo.h \ + coll_solo_mpool.h \ + coll_solo_barrier.c \ + coll_solo_reduce.c \ + coll_solo_bcast.c \ + coll_solo_allreduce.c \ + coll_solo_component.c \ + coll_solo_module.c \ + coll_solo_mpool.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_solo_DSO +component_install += mca_coll_solo.la +else +component_noinst += libmca_coll_solo.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_solo_la_SOURCES = $(sources) +mca_coll_solo_la_LDFLAGS = -module -avoid-version +mca_coll_solo_la_LIBADD = + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_solo_la_SOURCES =$(sources) +libmca_coll_solo_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h new file mode 100644 index 00000000000..4d64c63cde0 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo.h @@ -0,0 +1,190 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_SOLO_EXPORT_H +#define MCA_COLL_SOLO_EXPORT_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/communicator/communicator.h" +#include "ompi/win/win.h" +#include "ompi/include/mpi.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "opal/util/info.h" +#include "ompi/op/op.h" +#include "opal/runtime/opal_progress.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "coll_solo_mpool.h" + +BEGIN_C_DECLS +/** + * Structure to hold the solo coll component. 
First it holds the base coll component, and then + * holds a bunch of solo-coll-component-specific stuff (e.g., current MCA param values). + */ + typedef struct mca_coll_solo_component_t { + /* Base coll component */ + mca_coll_base_component_2_0_0_t super; + + /* MCA parameters */ + /* Priority of the solo module */ + int solo_priority; + /* The size of data_bufs in the static_win */ + uint32_t static_block_size; + uint32_t mpool_small_block_size; + uint32_t mpool_small_block_num; + uint32_t mpool_large_block_size; + uint32_t mpool_large_block_num; + + /* Shared memory pool */ + mca_coll_solo_mpool_t *solo_mpool; +} mca_coll_solo_component_t; + +/* Coll solo module */ +typedef struct mca_coll_solo_module_t { + /* Base module */ + mca_coll_base_module_t super; + + /* Whether this module has been lazily initialized or not yet */ + bool enabled; + + /** + * osc alrogithms attach memory blocks to this bynamic window and use it to perform one-sided + * communications. + */ + MPI_Win dynamic_win; + + /** + * This window is created by ompi_win_allocate_shared such that each process contains a shared + * memory data buffer, and this data buffer is divided into two parts - ctrl_bufs and data_bufs. + */ + MPI_Win static_win; + /** + * The first 4 * opal_cache_line_size bytes in the shared memory data buffer in static_win, used + * to store control messages. + */ + char **ctrl_bufs; + /** + * The rest of the shared memory data buffer in static_win, which is intent to be used to + * tranfer very small messages. Its size is set by static_block_size. + */ + char **data_bufs; + + /* Identify which ctrl_buf is currently used in mac_coll_solo_barrier_intra. 
*/ + int barrier_tag; +} mca_coll_solo_module_t; +OBJ_CLASS_DECLARATION(mca_coll_solo_module_t); + +/** + * Global component instance + */ +OMPI_MODULE_DECLSPEC extern mca_coll_solo_component_t mca_coll_solo_component; + +/** + * coll module functions + */ +int mca_coll_solo_init_query(bool enable_progress_threads, bool enable_mpi_threads); + +mca_coll_base_module_t *mca_coll_solo_comm_query(struct ompi_communicator_t *comm, int *priority); + +/* Lazily enable a module (since it involves expensive memory allocation, etc.) */ +int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); + +/* Attach a memory block to the dynamic_win of a communicator */ +char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, + size_t local_buf_size); + +/* Detach a memory block from the dynamic_win of a communicator */ +void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, + char ***attached_bufs); + +/* Setup and initialize the static_win of a communicator */ +void mca_coll_solo_setup_static_win(mca_coll_solo_module_t *solo_module, + struct ompi_communicator_t *comm, + size_t data_buf_size); + +/* MPI_Barrier algorithms */ +int mac_coll_solo_barrier_intra(struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* MPI_Bcast algorithms */ +int mca_coll_solo_bcast_intra(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* MPI_Reduce 
algorithms */ +int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_reduce_ring_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t + *comm, mca_coll_base_module_t * module); + +int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* MPI_Allreduce algorithms */ +int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); +END_C_DECLS +#endif /* MCA_COLL_SOLO_EXPORT_H */ diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c new file mode 100644 index 00000000000..383d28b66d6 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { + mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); + } + else { + mca_coll_solo_allreduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, comm, module); + } + return OMPI_SUCCESS; + +} + + +/** + * Each process operates a part of the shared data buffer in turn. + * Suppose the number of processes is 4. + * Step 1: + * | P0 | P1 | P2 | P3 | + * Step 2: + * | P1 | P2 | P3 | P0 | + * Step 3: + * | P2 | P3 | P0 | P1 | + * Step 4: + * | P3 | P0 | P1 | P2 | + * At last, all the processes copy data back from the shared data buffer. + */ +int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + + char **data_bufs = NULL; + int *ids = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = solo_module->data_bufs; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + ids = (int *) malloc(sizeof(int) * size); + ids[rank] = + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + + 
ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, + MPI_DATATYPE_NULL, + ids, + 1, MPI_INT, comm, + (mca_coll_base_module_t *) + solo_module); + for (i = 0; i < size; i++) { + data_bufs[i] = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + count * extent); + } + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + } + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + //cur_win->w_osc_module->osc_fence(0, cur_win); + memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* For other iterations, do operations on the solo data buffer */ + else { + ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + data_bufs[cur], seg_count, dtype); + //cur_win->w_osc_module->osc_fence(0,cur_win); + mac_coll_solo_barrier_intra(comm, module); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, all the processes copy data from the solo data buffer */ + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + 
} else { + seg_count = count - i * l_seg_count; + } + memcpy((char *) c, data_bufs[i], seg_count * extent); + c = c + seg_count * extent; + } + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + ; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], + count * extent); + if (ids != NULL) { + free(ids); + ids = NULL; + } + + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + + } else { + //printf("TOO BIG\n"); + } + + + return OMPI_SUCCESS; +} + +int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char **data_bufs = NULL; + int id; + MPI_Win cur_win; + char *local_buf = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + for (i = 0; i < size; i++) { + data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; + } + cur_win = solo_module->static_win; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + local_buf = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + 
cur_win = solo_module->dynamic_win; + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + } + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + cur_win->w_osc_module->osc_fence(0, cur_win); + cur_win->w_osc_module->osc_put((char *) sbuf + + cur * l_seg_count * extent, + seg_count, dtype, cur, + (ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + /* For other iterations, do operations on the solo data buffer */ + else { + cur_win->w_osc_module->osc_accumulate((char *) sbuf + + cur * l_seg_count * + extent, seg_count, dtype, cur, (ptrdiff_t) + data_bufs[cur], seg_count, dtype, op, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, all the processes copies data from the solo data buffer */ + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - i * l_seg_count; + } + cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, + (ptrdiff_t) data_bufs[i], seg_count, dtype, cur_win); + c = c + seg_count * extent; + } + 
cur_win->w_osc_module->osc_fence(0, cur_win); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } else { + //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/solo/coll_solo_barrier.c b/ompi/mca/coll/solo/coll_solo_barrier.c new file mode 100644 index 00000000000..26777e92acd --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_barrier.c @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" +int mac_coll_solo_barrier_intra(struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + + int rank = ompi_comm_rank(comm); + /* Atomic add to current ctrl_buf */ + char *barrier_ctrl_bufs = solo_module->ctrl_bufs[0] + opal_cache_line_size; + opal_atomic_add_fetch_32((opal_atomic_int32_t *) (barrier_ctrl_bufs + solo_module->barrier_tag * opal_cache_line_size), 1); + while (*((int32_t *) (barrier_ctrl_bufs + (solo_module->barrier_tag) * opal_cache_line_size)) != ompi_comm_size(comm)) { + opal_progress(); + } + + /* Set previous used ctrl_buf to 0 */ + if (rank == 0) { + *((int32_t *) (barrier_ctrl_bufs + ((solo_module->barrier_tag + 2) % 3) * opal_cache_line_size)) = 0; + } + /* Set barrier_tag to next ctrl_buf */ + solo_module->barrier_tag = (solo_module->barrier_tag + 1) % 3; + return OMPI_SUCCESS; +} 
diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c new file mode 100644 index 00000000000..e0482f609cf --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -0,0 +1,147 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +int mca_coll_solo_bcast_intra(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { + mca_coll_solo_bcast_linear_intra_memcpy(buff, count, dtype, root, comm, module); + } + else { + mca_coll_solo_bcast_linear_intra_osc(buff, count, dtype, root, comm, module); + } + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + int id; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char *data_buf; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_buf = solo_module->data_bufs[root]; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + } + mca_coll_solo_bcast_linear_intra_memcpy(&id, 1, MPI_INT, root, comm, module); + data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + } else { + /* TODO: Add support for very large 
messages */ + //printf("TOO BIG\n"); + } + + //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + if (rank == root) { + memcpy(data_buf, (char *) buff, count * extent); + } + //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + mac_coll_solo_barrier_intra(comm, module); + if (rank != root) { + memcpy((char *) buff, data_buf, count * extent); + } + //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + mac_coll_solo_barrier_intra(comm, module); + if ((size_t) count * extent > mca_coll_solo_component.static_block_size && + (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } + } else { + /* TODO: Add support for very large messages */ + //printf("TOO BIG\n"); + } + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + int id = 0; + char **attached_bufs = NULL; + MPI_Win cur_win; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char *data_buf; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_buf = (char *) 0 + 4 * opal_cache_line_size; + cur_win = solo_module->static_win; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + attached_bufs = 
mca_coll_solo_attach_buf(solo_module, comm, data_buf, count * extent); + } else { + attached_bufs = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); + } + data_buf = attached_bufs[root]; + cur_win = solo_module->dynamic_win; + } else { + //printf("TOO BIG\n"); + } + + /* Root copy to shared memory */ + cur_win->w_osc_module->osc_fence(0, cur_win); + if (rank == root) { + cur_win->w_osc_module->osc_put(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, + cur_win); + } + cur_win->w_osc_module->osc_fence(0, cur_win); + /* Other processes copy data from shared memory */ + if (rank != root) { + cur_win->w_osc_module->osc_get(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, + cur_win); + } + cur_win->w_osc_module->osc_fence(0, cur_win); + + if ((size_t) count * extent > mca_coll_solo_component.static_block_size && + (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + mca_coll_solo_detach_buf(solo_module, comm, data_buf, &attached_bufs); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } else { + mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); + } + } else { + //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/solo/coll_solo_component.c b/ompi/mca/coll/solo/coll_solo_component.c new file mode 100644 index 00000000000..34116ba5d85 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_component.c @@ -0,0 +1,148 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "opal/util/show_help.h"
#include "ompi/constants.h"
#include "ompi/mca/coll/coll.h"
#include "coll_solo.h"


/**
 * Public string showing the coll ompi_solo component version number
 */
const char *mca_coll_solo_component_version_string =
    "Open MPI solo collective MCA component version " OMPI_VERSION;

/**
 * Local functions
 */
static int solo_close(void);
static int solo_register(void);

/**
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
mca_coll_solo_component_t mca_coll_solo_component = {

    /* First, fill in the super */

    {
        /* First, the mca_component_t struct containing meta
           information about the component itself */
        .collm_version = {
            MCA_COLL_BASE_VERSION_2_0_0,

            /* Component name and version */
            .mca_component_name = "solo",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION,
                                  OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component functions */
            .mca_close_component = solo_close,
            .mca_register_component_params = solo_register,
        },
        .collm_data = {
            /* The component is not checkpoint ready */
            MCA_BASE_METADATA_PARAM_NONE},

        /* Initialization / querying functions */
        .collm_init_query = mca_coll_solo_init_query,
        .collm_comm_query = mca_coll_solo_comm_query,
    },

    /* Shared-component specific information.
     * NOTE(review): these default literals must stay in sync with the values
     * re-assigned in solo_register() below. */

    /* (default) priority */
    0,
    /* (default) static_block_size */
    4096,
    /* (default) mpool_small_block_size */
    1048576,
    /* (default) mpool_small_block_num */
    0,
    /* (default) mpool_large_block_size */
    8388608,
    /* (default) mpool_large_block_num */
    0,
    /* (default) pointer to the shared mpool */
    NULL
};

/**
 * Shut down the component
 */
static int solo_close(void)
{
    return OMPI_SUCCESS;
}

/**
 * Register MCA params.  Each parameter is read-only after registration
 * (MCA_BASE_VAR_SCOPE_READONLY); the assignments re-state the component
 * defaults before mca_base_component_var_register() may overwrite them
 * from the environment / command line.
 */
static int solo_register(void)
{
    mca_base_component_t *c = &mca_coll_solo_component.super.collm_version;
    mca_coll_solo_component_t *cs = &mca_coll_solo_component;

    /**
     * If we want to be selected (i.e., all procs on one node), then we should have a high
     * priority.
     */
    cs->solo_priority = 0;
    (void) mca_base_component_var_register(c, "priority",
                                           "Priority of the solo coll component",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           0, OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->solo_priority);

    cs->static_block_size = 4096;
    (void) mca_base_component_var_register(c, "static_block_size",
                                           "static block size of the static window",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->static_block_size);

    cs->mpool_small_block_size = 1048576;
    (void) mca_base_component_var_register(c, "mpool_small_block_size",
                                           "small block size of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &cs->mpool_small_block_size);

    cs->mpool_small_block_num = 0;
    (void) mca_base_component_var_register(c, "mpool_small_block_num",
                                           "number of small blocks of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->mpool_small_block_num);

    cs->mpool_large_block_size = 8388608;
    (void) mca_base_component_var_register(c, "mpool_large_block_size",
                                           "large block size of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &cs->mpool_large_block_size);

    cs->mpool_large_block_num = 0;
    (void) mca_base_component_var_register(c, "mpool_large_block_num",
                                           "number of large blocks of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->mpool_large_block_num);

    return OMPI_SUCCESS;
}
diff --git a/ompi/mca/coll/solo/coll_solo_module.c b/ompi/mca/coll/solo/coll_solo_module.c
new file mode 100644
index 00000000000..49651a81acd
--- /dev/null
+++
b/ompi/mca/coll/solo/coll_solo_module.c @@ -0,0 +1,320 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include +#ifdef HAVE_SCHED_H +#include +#endif +#include +#ifdef HAVE_SYS_MMAN_H +#include +#endif /* HAVE_SYS_MMAN_H */ +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#include "mpi.h" +#include "opal_stdint.h" +#include "opal/mca/hwloc/base/base.h" +#include "opal/util/os_path.h" + +#include "ompi/communicator/communicator.h" +#include "ompi/group/group.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/mca/rte/rte.h" +#include "ompi/proc/proc.h" +#include "coll_solo.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" + + +/** + * Local functions + */ +static int mca_coll_solo_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); +static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); + +/* solo module constructor */ +static void mca_coll_solo_module_construct(mca_coll_solo_module_t * module) +{ + module->enabled = false; + module->dynamic_win = NULL; + module->static_win = NULL; + module->ctrl_bufs = NULL; + module->data_bufs = NULL; + module->barrier_tag = 0; + module->super.coll_module_disable = mca_coll_solo_module_disable; +} + +/* solo module destructor */ +static void mca_coll_solo_module_destruct(mca_coll_solo_module_t * module) +{ + return; +} + +/* Disable solo module */ +static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + if (module->base_data != NULL) { + OBJ_RELEASE(module->base_data); + } + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + 
solo_module->enabled = false; + + /* If comm is MPI_COMM_WORLD, windows will be free at ompi_mpi_finalize.c:320 ompi_win_finalize() */ + // if (comm != MPI_COMM_WORLD) { + // int rank = ompi_comm_rank(comm); + + // /* Free the windows */ + // if (m->dynamic_win != NULL) { + // ompi_win_free(m->dynamic_win); + // } + // if (m->static_win != NULL) { + // ompi_win_free(m->static_win); + // } + // } + + if (solo_module->ctrl_bufs != NULL) { + free(solo_module->ctrl_bufs); + solo_module->ctrl_bufs = NULL; + } + + if (solo_module->data_bufs != NULL) { + free(solo_module->data_bufs); + solo_module->data_bufs = NULL; + } + + return OMPI_SUCCESS; +} + +OBJ_CLASS_INSTANCE(mca_coll_solo_module_t, + mca_coll_base_module_t, + mca_coll_solo_module_construct, mca_coll_solo_module_destruct); + +/** + * Initial query function that is invoked during MPI_INIT, allowing this component to disqualify + * itself if it doesn't support the required level of thread support. This function is invoked + * exactly once. + */ +int mca_coll_solo_init_query(bool enable_progress_threads, bool enable_mpi_threads) +{ + /* if no session directory was created, then we cannot be used */ + if (NULL == ompi_process_info.job_session_dir) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* Don't do much here because we don't really want to allocate any + shared memory until this component is selected to be used. */ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:init_query: pick me! pick me!"); + return OMPI_SUCCESS; +} + + +/** + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. 
+ */ +mca_coll_base_module_t *mca_coll_solo_comm_query(struct ompi_communicator_t * comm, int *priority) +{ + mca_coll_solo_module_t *solo_module; + + /** + * If we're intercomm, or if there's only one process in the communicator, or if not all the + * processes in the communicator are not on this node, then we don't want to run. + */ + if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) + || ompi_group_have_remote_peers(comm->c_local_group)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + /* Get the priority level attached to this module. If priority is less + * than or equal to 0, then the module is unavailable. */ + *priority = mca_coll_solo_component.solo_priority; + if (0 >= mca_coll_solo_component.solo_priority) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:comm_query (%d/%s): priority too low; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + solo_module = OBJ_NEW(mca_coll_solo_module_t); + if (NULL == solo_module) { + return NULL; + } + + /* All is good -- return a module */ + solo_module->super.coll_module_enable = mca_coll_solo_module_enable; + solo_module->super.ft_event = NULL; + solo_module->super.coll_allgather = NULL; + solo_module->super.coll_allgatherv = NULL; + solo_module->super.coll_allreduce = mca_coll_solo_allreduce_intra; + solo_module->super.coll_alltoall = NULL; + solo_module->super.coll_alltoallv = NULL; + solo_module->super.coll_alltoallw = NULL; + solo_module->super.coll_barrier = mac_coll_solo_barrier_intra; + solo_module->super.coll_bcast = mca_coll_solo_bcast_intra; + solo_module->super.coll_exscan = NULL; + solo_module->super.coll_gather = NULL; + solo_module->super.coll_gatherv = NULL; + solo_module->super.coll_reduce = mca_coll_solo_reduce_intra; + 
solo_module->super.coll_reduce_scatter = NULL; + solo_module->super.coll_scan = NULL; + solo_module->super.coll_scatter = NULL; + solo_module->super.coll_scatterv = NULL; + + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:comm_query (%d/%s): pick me! pick me!", + comm->c_contextid, comm->c_name); + return &(solo_module->super); +} + +/* Init the solo module on the communicator */ +static int mca_coll_solo_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + /* prepare the placeholder for the array of request for invoking base module */ + module->base_data = OBJ_NEW(mca_coll_base_comm_t); + if (NULL == module->base_data) { + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +/* Enable the solo module on the communicator lazily */ +int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + /** + * Temporarily use tuned module to prevent the collective operations in this module are invoked + * before the initialization. 
+ */ + int var_id; + int tmp_priority = 100; + const int *origin_priority = NULL; + int tmp_origin = 0; + mca_base_var_find_by_name("coll_tuned_priority", &var_id); + mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); + tmp_origin = *origin_priority; + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + comm->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling; + + /* Create the mpool */ + if (mca_coll_solo_component.solo_mpool == NULL) { + mca_coll_solo_component.solo_mpool = OBJ_NEW(mca_coll_solo_mpool_t); + } + + /* Create the dynamic_win */ + ompi_win_create_dynamic((opal_info_t *) (&ompi_mpi_info_null), comm, + &solo_module->dynamic_win); + + /* Create the static_win with shared memory allocation */ + mca_coll_solo_setup_static_win(solo_module, comm, + mca_coll_solo_component.static_block_size); + + solo_module->enabled = true; + + /* Set the functions and the priority back */ + comm->c_coll->coll_allreduce = mca_coll_solo_allreduce_intra; + mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + return OMPI_SUCCESS; +} + +/** + * Attach a memory block to the dynamic_win of a communicator, returns an array contains the + * addresses of all the blocks of the processes in the communicator. + * local_buf == NULL and local_buf_size == 0 means there is no block to be attached on this process. 
+ */ +char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, size_t local_buf_size) +{ + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + char **attached_bufs = (char **) malloc(sizeof(char *) * size); + attached_bufs[rank] = local_buf; + ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, + MPI_DATATYPE_NULL, + attached_bufs, + 1, MPI_AINT, comm, + (mca_coll_base_module_t *) solo_module); + + solo_module->dynamic_win->w_osc_module->osc_win_attach(solo_module->dynamic_win, local_buf, + local_buf_size); + + return attached_bufs; +} + +/* Detach a memory block from the dynamic_win of a communicator */ +void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, char ***attached_bufs) +{ + if (local_buf != NULL) { + solo_module->dynamic_win->w_osc_module->osc_win_detach(solo_module->dynamic_win, local_buf); + } + + free(*attached_bufs); + *attached_bufs = NULL; + return; +} + +/* Setup and initialize the static_win of a communicator */ +void mca_coll_solo_setup_static_win(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, size_t data_buf_size) +{ + int i; + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + int *baseptr; + /* Create the static win */ + ompi_win_allocate_shared(4 * opal_cache_line_size + data_buf_size, + sizeof(char), + (opal_info_t *) (&ompi_mpi_info_null), comm, + &baseptr, &solo_module->static_win); + size_t static_size[size]; + int static_disp[size]; + solo_module->ctrl_bufs = (char **) malloc(sizeof(char *) * size); + solo_module->data_bufs = (char **) malloc(sizeof(char *) * size); + /** + * Get the shared memory address created with the static window, + * the first 4 * opal_cache_line_size is used for control messages, + * the rest is used for transfer very small messages. 
+ */ + for (i = 0; i < size; i++) { + solo_module->static_win->w_osc_module->osc_win_shared_query(solo_module->static_win, i, + &(static_size[i]), + &(static_disp[i]), + &(solo_module->ctrl_bufs[i])); + solo_module->data_bufs[i] = (char *) (solo_module->ctrl_bufs[i]) + 4 * opal_cache_line_size; + } + /* Init ctrl_bufs with 0s */ + solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); + for (i = 0; i < 4; i++) { + char *ptr = solo_module->ctrl_bufs[rank] + i * opal_cache_line_size; + *((int32_t *) ptr) = 0; + } + solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); +} diff --git a/ompi/mca/coll/solo/coll_solo_mpool.c b/ompi/mca/coll/solo/coll_solo_mpool.c new file mode 100644 index 00000000000..7dc7479a2a1 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_mpool.c @@ -0,0 +1,233 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +static void mca_coll_solo_queue_construct(mca_coll_solo_queue_t * queue); +static void mca_coll_solo_queue_destruct(mca_coll_solo_queue_t * queue); + +/* queue constructor */ +static void mca_coll_solo_queue_construct(mca_coll_solo_queue_t * queue) +{ + return; +} + +/* queue destructor */ +static void mca_coll_solo_queue_destruct(mca_coll_solo_queue_t * queue) +{ + return; +} + +OBJ_CLASS_INSTANCE(mca_coll_solo_queue_t, opal_object_t, mca_coll_solo_queue_construct, + mca_coll_solo_queue_destruct); + +/* Init the queue with node-wise communicator, number of blocks and size of each block. 
*/ +void mca_coll_solo_queue_init(mca_coll_solo_queue_t * queue, ompi_communicator_t * node_comm, + int block_num, int block_size) +{ + int node_rank = ompi_comm_rank(node_comm); + queue->block_size = block_size; + queue->block_num = block_num; + int *temp_ptr; + int id_queue_size = opal_cache_line_size * (block_num + 3); + if (node_rank == 0) { + ompi_win_allocate_shared(block_size * block_num + id_queue_size, sizeof(char), + (opal_info_t *) (&ompi_mpi_info_null), + node_comm, &temp_ptr, &(queue->win)); + } else { + ompi_win_allocate_shared(0, sizeof(char), + (opal_info_t *) (&ompi_mpi_info_null), + node_comm, &temp_ptr, &(queue->win)); + } + size_t temp_size; + int temp_disp; + /* Get the address of the shared memory */ + queue->win->w_osc_module->osc_win_shared_query(queue->win, 0, &temp_size, &temp_disp, + &queue->blocks); + /* Set up the queue as shown in the coll_shared_mpool.h */ + queue->id_queue = queue->blocks + block_size * block_num; + queue->head = queue->id_queue + opal_cache_line_size * (block_num + 1); + queue->tail = queue->id_queue + opal_cache_line_size * (block_num + 2); + queue->win->w_osc_module->osc_fence(0, queue->win); + if (node_rank == 0) { + (*((mca_coll_solo_tag_t *) queue->head)).id = 0; + (*((mca_coll_solo_tag_t *) queue->head)).ref = 0; + *((COLL_SOLO_WORD *) queue->tail) = block_num; + int i; + for (i = 0; i < block_num + 1; i++) { + char *temp = queue->id_queue + opal_cache_line_size * i; + *((COLL_SOLO_WORD *) temp) = i + 1; + if (i == block_num) { + *((COLL_SOLO_WORD *) temp) = 0; + } + } + } + queue->win->w_osc_module->osc_fence(0, queue->win); + return; +} + +/* + * Request a block from the queue + */ +int mca_coll_solo_queue_request(mca_coll_solo_queue_t * queue) +{ + COLL_SOLO_DWORD cur_head, new_head; + COLL_SOLO_WORD cur_tail; + + do { + cur_head = *((COLL_SOLO_DWORD *) queue->head); + cur_tail = *((COLL_SOLO_WORD *) queue->tail); + if (((mca_coll_solo_tag_t *) &cur_head)->id == cur_tail) { + return -1; + } + new_head = 
cur_head; + ((mca_coll_solo_tag_t *) &new_head)->id = (((mca_coll_solo_tag_t *) &new_head)->id + 1) % + (queue->block_num + 1); + ((mca_coll_solo_tag_t *) &new_head)->ref = ((mca_coll_solo_tag_t *) &new_head)->ref + 1; + } while (!opal_atomic_compare_exchange_strong_64((COLL_SOLO_DWORD *) queue->head, + &cur_head, new_head)); + char *temp = queue->id_queue + opal_cache_line_size * ((mca_coll_solo_tag_t *) &cur_head)->id; + COLL_SOLO_WORD id = *((COLL_SOLO_WORD *) temp); + *((COLL_SOLO_WORD *) temp) = 0; + return id; +} + +/* + * Calculate block address based on block id + */ +char *mca_coll_solo_queue_calculate(mca_coll_solo_queue_t * queue, int id) +{ + return queue->blocks + queue->block_size * (id - 1); +} + +/* + * Return a block to the queue + */ +void mca_coll_solo_queue_return(mca_coll_solo_queue_t * queue, int id) +{ + COLL_SOLO_WORD cur_tail; + char *temp; + int32_t zero = 0; + do { + zero = 0; + cur_tail = *((COLL_SOLO_WORD *) queue->tail); + temp = queue->id_queue + opal_cache_line_size * cur_tail; + } while (!opal_atomic_compare_exchange_strong_32((COLL_SOLO_WORD *) temp, &zero, id)); + opal_atomic_compare_exchange_strong_32((COLL_SOLO_WORD *) queue->tail, &cur_tail, + (cur_tail + 1) % (queue->block_num + 1)); + return; +} + + +/* mpool classes */ +static void mca_coll_solo_mpool_construct(mca_coll_solo_mpool_t * mpool); +static void mca_coll_solo_mpool_destruct(mca_coll_solo_mpool_t * mpool); + +OBJ_CLASS_INSTANCE(mca_coll_solo_mpool_t, opal_object_t, mca_coll_solo_mpool_construct, + mca_coll_solo_mpool_destruct); + +/* mpool constructor */ +static void mca_coll_solo_mpool_construct(mca_coll_solo_mpool_t * mpool) +{ + /* Create the node_comm which contains all the processes on a node */ + ompi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + (opal_info_t *) (&ompi_mpi_info_null), &(mpool->node_comm)); + int node_size = ompi_comm_size(mpool->node_comm); + /* Create the queues */ + mpool->small_queue = OBJ_NEW(mca_coll_solo_queue_t); + 
mpool->large_queue = OBJ_NEW(mca_coll_solo_queue_t); + /* verify the mca parameters */ + if (mca_coll_solo_component.mpool_small_block_size > + mca_coll_solo_component.mpool_large_block_size) { + uint32_t temp = mca_coll_solo_component.mpool_small_block_size; + mca_coll_solo_component.mpool_small_block_size = + mca_coll_solo_component.mpool_large_block_size; + mca_coll_solo_component.mpool_large_block_size = temp; + } + if (mca_coll_solo_component.mpool_small_block_num < (uint32_t) node_size) { + if (mca_coll_solo_component.mpool_small_block_num == 0) { + mca_coll_solo_component.mpool_small_block_num = node_size * 4; + } + else { + mca_coll_solo_component.mpool_small_block_num = node_size; + } + } + if (mca_coll_solo_component.mpool_large_block_num < (uint32_t) node_size) { + if (mca_coll_solo_component.mpool_large_block_num == 0) { + mca_coll_solo_component.mpool_large_block_num = node_size * 2; + } + else { + mca_coll_solo_component.mpool_large_block_num = node_size; + } + } + /* Init the queues */ + mca_coll_solo_queue_init(mpool->small_queue, mpool->node_comm, + mca_coll_solo_component.mpool_small_block_num, + mca_coll_solo_component.mpool_small_block_size); + mca_coll_solo_queue_init(mpool->large_queue, mpool->node_comm, + mca_coll_solo_component.mpool_large_block_num, + mca_coll_solo_component.mpool_large_block_size); + return; +} + +/* mpool destructor */ +static void mca_coll_solo_mpool_destruct(mca_coll_solo_mpool_t * mpool) +{ + OBJ_RELEASE(mpool->small_queue); + OBJ_RELEASE(mpool->large_queue); + return; +} + +/* Request block from the memory pool */ +int mca_coll_solo_mpool_request(mca_coll_solo_mpool_t * mpool, size_t len) +{ + if (len > mca_coll_solo_component.mpool_large_block_size) { + return -1; + } + int id = -1; + while (id == -1) { + if (len <= mca_coll_solo_component.mpool_small_block_size) { + id = mca_coll_solo_queue_request(mpool->small_queue); + } else { + id = mca_coll_solo_queue_request(mpool->large_queue); + } + } + return id; +} + +/* 
Calculate block address */ +char *mca_coll_solo_mpool_calculate(mca_coll_solo_mpool_t * mpool, int id, size_t len) +{ + if (id <= 0 || len > mca_coll_solo_component.mpool_large_block_size) { + return NULL; + } + char *addr; + if (len <= mca_coll_solo_component.mpool_small_block_size) { + addr = mca_coll_solo_queue_calculate(mpool->small_queue, id); + } else { + addr = mca_coll_solo_queue_calculate(mpool->large_queue, id); + } + return addr; +} + +/* Return block to memory pool */ +void mca_coll_solo_mpool_return(mca_coll_solo_mpool_t * mpool, int id, size_t len) +{ + if (len <= mca_coll_solo_component.mpool_small_block_size) { + mca_coll_solo_queue_return(mpool->small_queue, id); + } else if (len <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_queue_return(mpool->large_queue, id); + } else { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:mca_coll_solo_mpool_return: block size is wrong!"); + } + return; +} diff --git a/ompi/mca/coll/solo/coll_solo_mpool.h b/ompi/mca/coll/solo/coll_solo_mpool.h new file mode 100644 index 00000000000..695ebfb0dca --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_mpool.h @@ -0,0 +1,96 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "opal/class/opal_object.h" +#include "opal/class/opal_hash_table.h" +#include "opal/class/opal_list.h" +#include "opal/threads/threads.h" + +#define COLL_SOLO_DWORD int64_t +#define COLL_SOLO_WORD int32_t +typedef struct { + /* the block id */ + COLL_SOLO_WORD id; + /* ref is added to resolve the potential ABA problem */ + COLL_SOLO_WORD ref; +} mca_coll_solo_tag_t; + +/** + * A lock-free array-based queue containing the blocks which can be accessed by any processes + * on the same node. 
+ * An example of the queue is shown below (block_num is n, the size of each element in id queue, + * head and tail is opal_cache_line_size to avoid false sharing): + * Init: + * | blocks | id queue | head | tail | + * | block1 (avail) | block2 (avail) |...| blockn (avail) | 1 | 2 | 3 |...| n | 0 | 0/0 | n | + * Request a block - 0 in the id queue means it is not available: + * | block1 (using) | block2 (avail) |...| blockn (avail) | 0 | 2 | 3 |...| n | 0 | 1/1 | n | + * Request another block: + * | block1 (using) | block2 (using) |...| blockn (avail) | 0 | 0 | 3 |...| n | 0 | 2/2 | n | + * Return block 2: + * | block1 (using) | block2 (avail) |...| blockn (avail) | 0 | 0 | 3 |...| n | 2 | 2/2 | 0 | + */ +struct mca_coll_solo_queue_t { + /* the start address of blocks */ + char *blocks; + /* the number of blocks */ + int block_num; + /* the size of each block */ + size_t block_size; + /* the start address of id queue */ + char *id_queue; + /* the address of head */ + char *head; + /* the address of tail */ + char *tail; + /* a node-wise window */ + MPI_Win win; +}; + +typedef struct mca_coll_solo_queue_t mca_coll_solo_queue_t; + +OBJ_CLASS_DECLARATION(mca_coll_solo_queue_t); + +/* Init the queue */ +void mca_coll_solo_queue_init(mca_coll_solo_queue_t * queue, ompi_communicator_t * node_comm, + int block_num, int block_size); +/* Request a block from the queue, return a block id */ +int mca_coll_solo_queue_request(mca_coll_solo_queue_t * queue); +/* Calculate the block address with a block id */ +char *mca_coll_solo_queue_calculate(mca_coll_solo_queue_t * queue, int id); +/* Return a block to the queue */ +void mca_coll_solo_queue_return(mca_coll_solo_queue_t * queue, int id); + +/* Each node has a shared memory pool, which contains two queues of different block sizes.*/ +struct mca_coll_solo_mpool_t { + /* Generic parent class for all Open MPI objects */ + opal_object_t super; + /* An array-based queue contains small blocks */ + mca_coll_solo_queue_t *small_queue; 
+ /* An array-based queue contains large blocks */ + mca_coll_solo_queue_t *large_queue; + /* A communicator contains all the processes on a node */ + ompi_communicator_t *node_comm; +}; + +typedef struct mca_coll_solo_mpool_t mca_coll_solo_mpool_t; + +OBJ_CLASS_DECLARATION(mca_coll_solo_mpool_t); + +/* Request block from memory pool */ +int mca_coll_solo_mpool_request(mca_coll_solo_mpool_t * mpool, size_t len); + +/* Calculate block address */ +char *mca_coll_solo_mpool_calculate(mca_coll_solo_mpool_t * mpool, int id, size_t len); + +/* Return block to memory pool */ +void mca_coll_solo_mpool_return(mca_coll_solo_mpool_t * mpool, int id, size_t len); diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c new file mode 100644 index 00000000000..1e2cab430ef --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { + mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); + } + else { + mca_coll_solo_reduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, root, comm, module); + + } + return OMPI_SUCCESS; + +} + +/** + * Each process operates a part of the shared data buffer in turn. + * Suppose the number of processes is 4. + * Step 1: + * | P0 | P1 | P2 | P3 | + * Step 2: + * | P1 | P2 | P3 | P0 | + * Step 3: + * | P2 | P3 | P0 | P1 | + * Step 4: + * | P3 | P0 | P1 | P2 | + * At last, root copies data back from the shared data buffer. 
+ */ +int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, struct ompi_communicator_t + *comm, mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + + char **data_bufs = NULL; + int *ids = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = solo_module->data_bufs; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + ids = (int *) malloc(sizeof(int) * size); + ids[rank] = + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + + ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, + MPI_DATATYPE_NULL, + ids, + 1, MPI_INT, comm, + (mca_coll_base_module_t *) + solo_module); + for (i = 0; i < size; i++) { + data_bufs[i] = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + count * extent); + } + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + 
} + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + //cur_win->w_osc_module->osc_fence(0, cur_win); + memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* For other iterations, do operations on the solo data buffer */ + else { + ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + data_bufs[cur], seg_count, dtype); + //cur_win->w_osc_modulbe->osc_fence(0,cur_win); + mac_coll_solo_barrier_intra(comm, module); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, root copies data from the solo data buffer */ + if (rank == root) { + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - i * l_seg_count; + } + memcpy((char *) c, data_bufs[i], seg_count * extent); + c = c + seg_count * extent; + } + } + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + ; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], + count * extent); + if (ids != NULL) { + free(ids); + ids = NULL; + } + + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + } else { + //printf("TOO BIG\n"); + } + + + return OMPI_SUCCESS; +} + +int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + 
mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char **data_bufs = NULL; + int id; + MPI_Win cur_win; + char *local_buf = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + for (i = 0; i < size; i++) { + data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; + } + cur_win = solo_module->static_win; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + local_buf = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + cur_win = solo_module->dynamic_win; + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + } + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + cur_win->w_osc_module->osc_fence(0, cur_win); + cur_win->w_osc_module->osc_put((char *) sbuf + + cur * l_seg_count * extent, + seg_count, dtype, cur, + 
(ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + /* For other iterations, do operations on the solo data buffer */ + else { + cur_win->w_osc_module->osc_accumulate((char *) sbuf + + cur * l_seg_count * + extent, seg_count, dtype, cur, (ptrdiff_t) + data_bufs[cur], seg_count, dtype, op, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, root copies data from the solo data buffer */ + if (rank == root) { + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - i * l_seg_count; + } + cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, (ptrdiff_t) + data_bufs[i], seg_count, dtype, cur_win); + c = c + seg_count * extent; + } + } + cur_win->w_osc_module->osc_fence(0, cur_win); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } else { + //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} From d7f237924ff8149ec44ed1ca299ed7154ae45bfb Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Thu, 17 Oct 2019 14:40:22 -0400 Subject: [PATCH 2/7] Add a pipeline algorithm of broadcast to support very large message. 
--- ompi/mca/coll/solo/coll_solo.h | 14 ++ ompi/mca/coll/solo/coll_solo_bcast.c | 208 +++++++++++++++++++++++--- ompi/mca/coll/solo/coll_solo_module.c | 2 +- 3 files changed, 206 insertions(+), 18 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h index 4d64c63cde0..83c7e040d02 100644 --- a/ompi/mca/coll/solo/coll_solo.h +++ b/ompi/mca/coll/solo/coll_solo.h @@ -141,6 +141,20 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size); + +int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size); + /* MPI_Reduce algorithms */ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c index e0482f609cf..bcddb724d56 100644 --- a/ompi/mca/coll/solo/coll_solo_bcast.c +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -26,6 +26,7 @@ int mca_coll_solo_bcast_intra(void *buff, int count, return OMPI_SUCCESS; } +/* linear bcast with memcpy */ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, struct ompi_datatype_t *dtype, int root, @@ -35,17 +36,18 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; int rank = ompi_comm_rank(comm); - int id; ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); /* Enable solo module if necessary */ if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + /* Init the data_buf - shared among all the processes 
*/ + int id; char *data_buf; if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { data_buf = solo_module->data_bufs[root]; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_small_block_size) { if (rank == root) { id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); } @@ -53,29 +55,25 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, count * extent); } else { - /* TODO: Add support for very large messages */ - //printf("TOO BIG\n"); + return mca_coll_solo_bcast_pipeline_intra_memcpy(buff, count, dtype, root, comm, module, + mca_coll_solo_component.mpool_small_block_size); } - //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + /* Root copy data to the shared memory block */ if (rank == root) { memcpy(data_buf, (char *) buff, count * extent); } - //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); mac_coll_solo_barrier_intra(comm, module); + /* Other processes copy data from the shared memory block */ if (rank != root) { memcpy((char *) buff, data_buf, count * extent); } - //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); mac_coll_solo_barrier_intra(comm, module); if ((size_t) count * extent > mca_coll_solo_component.static_block_size && (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { if (rank == root) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); } - } else { - /* TODO: Add support for very large messages */ - //printf("TOO BIG\n"); } return OMPI_SUCCESS; } @@ -89,20 +87,21 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; int rank = ompi_comm_rank(comm); 
- int id = 0; - char **attached_bufs = NULL; - MPI_Win cur_win; ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); /* Enable solo module if necessary */ if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + /* Init the data_buf - shared among all the processes */ + int id = 0; + char **attached_bufs = NULL; + MPI_Win cur_win; char *data_buf; if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { data_buf = (char *) 0 + 4 * opal_cache_line_size; cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_small_block_size) { if (rank == root) { id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, @@ -114,7 +113,8 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, data_buf = attached_bufs[root]; cur_win = solo_module->dynamic_win; } else { - //printf("TOO BIG\n"); + return mca_coll_solo_bcast_pipeline_intra_osc(buff, count, dtype, root, comm, module, + mca_coll_solo_component.mpool_small_block_size); } /* Root copy to shared memory */ @@ -139,9 +139,183 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, } else { mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); } - } else { - //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if 
(!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + /* Init the data_bufs - shared among all the processes, needs two for the pipelining */ + int ids[2]; + char *data_bufs[2]; + int i; + for (i = 0; i < 2; i++) { + if (rank == root) { + ids[i] = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, seg_size); + } + } + mca_coll_solo_bcast_linear_intra_memcpy(ids, 2, MPI_INT, root, comm, module); + for (i = 0; i < 2; i++) { + data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + seg_size); + } + + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + + for (i = 0; i <= num_segments; i++) { + int cur = i & 1; + int pre = !cur; + if (i == 0) { + /* In the first iteration, root copies data to the current shared memory block */ + if (rank == root) { + memcpy(data_bufs[cur], (char *) buff, seg_count * extent); + } + } + else if ( i == num_segments) { + /* In the last iteration, other processes copy data from the previous shared memory block */ + memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], last_count * extent); + } + else { + /** + * For other iterations, root copies data to the current shared memory block and + * other proceeses copy data from the previous shared memory block. 
+ */ + if (rank == root) { + int temp_count = seg_count; + if ( i == num_segments - 1) { + temp_count = last_count; + } + memcpy(data_bufs[cur], ((char *) buff) + seg_count * extent * i, temp_count * extent); + } + else { + memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], seg_count * extent); + } + } + mac_coll_solo_barrier_intra(comm, module); } + /* Return the data_bufs */ + for (i = 0; i < 2; i++) { + if (rank == root) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[i], seg_size); + } + } + + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + /* Init the data_bufs - shared among all the processes, needs two for the pipelining */ + int ids[2]; + char **attached_bufs[2]; + MPI_Win cur_win = solo_module->dynamic_win; + char *data_bufs[2]; + int i; + for (i = 0; i < 2; i++) { + if (rank == root) { + ids[i] = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, seg_size); + data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + seg_size); + attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, data_bufs[i], seg_size); + } + else { + attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); + } + data_bufs[i] = attached_bufs[i][root]; + } + + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = 
count - seg_count * (num_segments - 1); + + cur_win->w_osc_module->osc_fence(0, cur_win); + for (i = 0; i <= num_segments; i++) { + int cur = i & 1; + int pre = !cur; + if (i == 0) { + /* In the first iteration, root copies data to the current shared memory block */ + if (rank == root) { + cur_win->w_osc_module->osc_put(buff, seg_count, dtype, root, (ptrdiff_t) data_bufs[cur], + seg_count, dtype, cur_win); + } + } + else if ( i == num_segments) { + /* In the last iteration, other processes copy data from the previous shared memory block */ + cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), + last_count, dtype, root, (ptrdiff_t) data_bufs[pre], + last_count, dtype, cur_win); + } + else { + /** + * For other iterations, root copies data to the current shared memory block and + * other proceeses copy data from the previous shared memory block. + */ + if (rank == root) { + int temp_count = seg_count; + if ( i == num_segments - 1) { + temp_count = last_count; + } + cur_win->w_osc_module->osc_put(((char *) buff) + seg_count * extent * i, + temp_count, dtype, root, (ptrdiff_t) data_bufs[cur], + temp_count, dtype, cur_win); + } + else { + cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), + seg_count, dtype, root, (ptrdiff_t) data_bufs[pre], + seg_count, dtype, cur_win); + } + } + cur_win->w_osc_module->osc_fence(0, cur_win); + } + + /* Return the data_bufs */ + for (i = 0; i < 2; i++) { + if (rank == root) { + mca_coll_solo_detach_buf(solo_module, comm, data_bufs[i], &attached_bufs[i]); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[i], seg_size); + } + else { + mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs[i]); + } + } + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/solo/coll_solo_module.c b/ompi/mca/coll/solo/coll_solo_module.c index 49651a81acd..0b59625a0c1 100644 --- a/ompi/mca/coll/solo/coll_solo_module.c +++ b/ompi/mca/coll/solo/coll_solo_module.c @@ -231,7 +231,7 
@@ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_commu /* Create the static_win with shared memory allocation */ mca_coll_solo_setup_static_win(solo_module, comm, - mca_coll_solo_component.static_block_size); + mca_coll_solo_component.static_block_size); solo_module->enabled = true; From 145589c81894132f27a34056f6def7122d81a7bd Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Thu, 17 Oct 2019 20:39:55 -0400 Subject: [PATCH 3/7] Support very large message for reduce and allreduce. --- ompi/mca/coll/solo/coll_solo_allreduce.c | 47 +++++++++++----- ompi/mca/coll/solo/coll_solo_bcast.c | 2 +- ompi/mca/coll/solo/coll_solo_reduce.c | 68 +++++++++++++++--------- 3 files changed, 77 insertions(+), 40 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 383d28b66d6..4eb4b31d6d2 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -82,7 +82,23 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count * extent); } } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_allreduce_ring_intra_memcpy(temp_sbuf, temp_rbuf, temp_count, dtype, op, + comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ @@ -92,9 +108,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int if (rank == size - 1) { seg_count = count - rank * l_seg_count; 
} - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -106,9 +120,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - //cur_win->w_osc_module->osc_fence(0, cur_win); memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); } @@ -116,13 +128,11 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int else { ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); - //cur_win->w_osc_module->osc_fence(0,cur_win); mac_coll_solo_barrier_intra(comm, module); } cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -138,7 +148,6 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int memcpy((char *) c, data_bufs[i], seg_count * extent); c = c + seg_count * extent; } - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { ; @@ -155,11 +164,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int data_bufs = NULL; } - } else { - //printf("TOO BIG\n"); } - - return OMPI_SUCCESS; } @@ -198,7 +203,23 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); cur_win = 
solo_module->dynamic_win; } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_allreduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, + comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c index bcddb724d56..88d1cc12ee4 100644 --- a/ompi/mca/coll/solo/coll_solo_bcast.c +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -139,7 +139,7 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, } else { mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); } - } + } return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 1e2cab430ef..336ad845c57 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -43,11 +43,12 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, * At last, root copies data back from the shared data buffer. 
*/ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, struct ompi_communicator_t - *comm, mca_coll_base_module_t * module) + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) { mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; int size = ompi_comm_size(comm); @@ -83,7 +84,23 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, count * extent); } } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_reduce_ring_intra_memcpy(temp_sbuf, temp_rbuf, temp_count, dtype, op, + root, comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ @@ -93,9 +110,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, if (rank == size - 1) { seg_count = count - rank * l_seg_count; } - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -107,9 +122,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - //cur_win->w_osc_module->osc_fence(0, 
cur_win); memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); } @@ -117,13 +130,11 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, else { ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); - //cur_win->w_osc_modulbe->osc_fence(0,cur_win); mac_coll_solo_barrier_intra(comm, module); } cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -141,11 +152,9 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, c = c + seg_count * extent; } } - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { - ; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if ((size_t) count * extent > mca_coll_solo_component.static_block_size && + (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], count * extent); if (ids != NULL) { @@ -157,11 +166,8 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, free(data_bufs); data_bufs = NULL; } - } else { - //printf("TOO BIG\n"); } - return OMPI_SUCCESS; } @@ -201,7 +207,23 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); cur_win = solo_module->dynamic_win; } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + 
COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_reduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, + root, comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ @@ -211,9 +233,7 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, if (rank == size - 1) { seg_count = count - rank * l_seg_count; } - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -243,7 +263,6 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -271,9 +290,6 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); - } else { - //printf("TOO BIG\n"); } - return OMPI_SUCCESS; } From a7d96d9251fa647b7e01ad80b900353d3ef7e5ce Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Thu, 17 Oct 2019 20:48:27 -0400 Subject: [PATCH 4/7] Fix comments in coll_solo_allreduce.c --- 
ompi/mca/coll/solo/coll_solo_allreduce.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 4eb4b31d6d2..7a3d7bb404e 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -229,9 +229,7 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou if (rank == size - 1) { seg_count = count - rank * l_seg_count; } - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -261,7 +259,6 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -287,9 +284,6 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); - } else { - //printf("TOO BIG\n"); - } - + } return OMPI_SUCCESS; } From b0ca414a22cef7178b43345dc69c4ab7364563ee Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Mon, 21 Oct 2019 14:23:51 -0400 Subject: [PATCH 5/7] Fix the memory allocation problem in reduce and allreduce. 
(should allocate l_seg_count instead of count) --- ompi/mca/coll/solo/coll_solo_allreduce.c | 64 +++++++++++++----------- ompi/mca/coll/solo/coll_solo_reduce.c | 63 ++++++++++++----------- 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 7a3d7bb404e..7637b1cad29 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -60,15 +60,23 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int mca_coll_solo_lazy_enable(module, comm); } + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + char **data_bufs = NULL; int *ids = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = solo_module->data_bufs; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); ids = (int *) malloc(sizeof(int) * size); ids[rank] = - mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, @@ -79,9 +87,10 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int for (i = 0; i < size; i++) { data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], - count * extent); + l_seg_count * extent); } } else { + /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ int seg_count = count; 
size_t typelng; ompi_datatype_type_size(dtype, &typelng); @@ -101,13 +110,6 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -149,11 +151,10 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int c = c + seg_count * extent; } mac_coll_solo_barrier_intra(comm, module); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { - ; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if ((size_t) l_seg_count * extent > mca_coll_solo_component.static_block_size && + (size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], - count * extent); + l_seg_count * extent); if (ids != NULL) { free(ids); ids = NULL; @@ -185,24 +186,34 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + char **data_bufs = NULL; int id; MPI_Win cur_win; char *local_buf = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); for (i = 0; i < size; i++) { data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; } cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { - 
id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); local_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + l_seg_count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); cur_win = solo_module->dynamic_win; } else { + /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ int seg_count = count; size_t typelng; ompi_datatype_type_size(dtype, &typelng); @@ -222,13 +233,6 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -276,14 +280,14 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou c = c + seg_count * extent; } cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { if (data_bufs != NULL) { free(data_bufs); data_bufs = NULL; } - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + 
mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); } return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 336ad845c57..5ef92c960ba 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -57,6 +57,14 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + /* Enable solo module if necessary */ if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); @@ -64,13 +72,13 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, char **data_bufs = NULL; int *ids = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = solo_module->data_bufs; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); ids = (int *) malloc(sizeof(int) * size); ids[rank] = - mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, @@ -81,9 +89,10 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, for (i = 0; i < size; i++) { data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], - count * extent); + l_seg_count * extent); } } else { + /* For the messages which are greater than 
mpool_large_block_size*np, invoke this reduce multiple times */ int seg_count = count; size_t typelng; ompi_datatype_type_size(dtype, &typelng); @@ -103,13 +112,6 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -153,10 +155,10 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } } mac_coll_solo_barrier_intra(comm, module); - if ((size_t) count * extent > mca_coll_solo_component.static_block_size && - (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if ((size_t) l_seg_count * extent > mca_coll_solo_component.static_block_size && + (size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], - count * extent); + l_seg_count * extent); if (ids != NULL) { free(ids); ids = NULL; @@ -189,22 +191,31 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + char **data_bufs = NULL; int id; MPI_Win cur_win; char *local_buf = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); for (i = 0; i < size; i++) { data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; } cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= 
mca_coll_solo_component.mpool_large_block_size) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); local_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + l_seg_count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); cur_win = solo_module->dynamic_win; } else { int seg_count = count; @@ -226,13 +237,7 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } + *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -282,14 +287,14 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, } } cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { if (data_bufs != NULL) { free(data_bufs); data_bufs = NULL; } - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); } return OMPI_SUCCESS; } From f6466bcc19bc5ea31b93027e71d36dcc4ffebeb5 Mon Sep 17 00:00:00 2001 From: Xi Luo 
Date: Tue, 26 Nov 2019 11:39:25 -0500 Subject: [PATCH 6/7] Redo the non contiguous datatype part in SOLO module. Known problems: 1. bcast: has bug if a datatype is from MPI_Bottom 2. reduce \ allreduce : fix or remove the non contiguous support. --- ompi/mca/coll/solo/coll_solo.h | 81 +++++----- ompi/mca/coll/solo/coll_solo_allreduce.c | 143 +---------------- ompi/mca/coll/solo/coll_solo_bcast.c | 187 +---------------------- ompi/mca/coll/solo/coll_solo_module.c | 48 ------ ompi/mca/coll/solo/coll_solo_reduce.c | 150 +----------------- 5 files changed, 53 insertions(+), 556 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h index 83c7e040d02..0e378275ba4 100644 --- a/ompi/mca/coll/solo/coll_solo.h +++ b/ompi/mca/coll/solo/coll_solo.h @@ -59,12 +59,6 @@ typedef struct mca_coll_solo_module_t { /* Whether this module has been lazily initialized or not yet */ bool enabled; - /** - * osc alrogithms attach memory blocks to this bynamic window and use it to perform one-sided - * communications. - */ - MPI_Win dynamic_win; - /** * This window is created by ompi_win_allocate_shared such that each process contains a shared * memory data buffer, and this data buffer is divided into two parts - ctrl_bufs and data_bufs. @@ -101,18 +95,6 @@ mca_coll_base_module_t *mca_coll_solo_comm_query(struct ompi_communicator_t *com /* Lazily enable a module (since it involves expensive memory allocation, etc.) 
*/ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); -/* Attach a memory block to the dynamic_win of a communicator */ -char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, - size_t local_buf_size); - -/* Detach a memory block from the dynamic_win of a communicator */ -void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, - char ***attached_bufs); - /* Setup and initialize the static_win of a communicator */ void mca_coll_solo_setup_static_win(mca_coll_solo_module_t *solo_module, struct ompi_communicator_t *comm, @@ -135,12 +117,6 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); - int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, struct ompi_datatype_t *dtype, int root, @@ -148,13 +124,6 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, mca_coll_base_module_t * module, size_t seg_size); -int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module, - size_t seg_size); - /* MPI_Reduce algorithms */ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, @@ -163,12 +132,6 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_reduce_ring_intra(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - 
mca_coll_base_module_t * module); - int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, @@ -176,11 +139,6 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int cou struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); /* MPI_Allreduce algorithms */ int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, int count, @@ -195,10 +153,39 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); + +/* Solo pack to shared memory */ +static inline void mca_coll_solo_pack_to_shared(void *local_buf, void *shared_buf, struct ompi_datatype_t *dtype, int count, ptrdiff_t extent) { + if (ompi_datatype_is_predefined(dtype)) { + memcpy((char *) shared_buf, (char *) local_buf, count * extent); + } + else { + MPI_Aint pos = 0; + ompi_datatype_pack_external("external32", local_buf, count, dtype, shared_buf, count * extent, &pos); + } +} + +/* Solo unpack from shared memory */ +static inline void mca_coll_solo_unpack_from_shared(void *local_buf, void *shared_buf, struct ompi_datatype_t *dtype, int count, ptrdiff_t extent) { + if (ompi_datatype_is_predefined(dtype)) { + memcpy((char *) local_buf, (char *) shared_buf, count * extent); + } + else { + MPI_Aint pos = 0; + ompi_datatype_unpack_external("external32", shared_buf, count * extent, &pos, local_buf, count, dtype); + } +} + +/* Solo copy from source to target */ +static inline void 
mca_coll_solo_copy(void *source, void *target, struct ompi_datatype_t *dtype, int count, ptrdiff_t extent) { + if (ompi_datatype_is_predefined(dtype)) { + memcpy(target, source, count * extent); + } + else { + ompi_datatype_copy_content_same_ddt(dtype, count, target, source); + } + return; +} + END_C_DECLS #endif /* MCA_COLL_SOLO_EXPORT_H */ diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 7637b1cad29..446495afa68 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -18,14 +18,7 @@ int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { - mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); - } - else { - mca_coll_solo_allreduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, comm, module); - } - return OMPI_SUCCESS; - + return mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); } @@ -91,10 +84,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } } else { /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int seg_count = mca_coll_solo_component.mpool_large_block_size / extent; int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { @@ -122,7 +112,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, 
seg_count * extent); + mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); } @@ -147,7 +137,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } else { seg_count = count - i * l_seg_count; } - memcpy((char *) c, data_bufs[i], seg_count * extent); + mca_coll_solo_copy((void *) data_bufs[i], (void *) c, dtype, seg_count, extent); c = c + seg_count * extent; } mac_coll_solo_barrier_intra(comm, module); @@ -167,127 +157,4 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } return OMPI_SUCCESS; -} - -int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - int size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); - int i; - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } - - char **data_bufs = NULL; - int id; - MPI_Win cur_win; - char *local_buf = NULL; - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - data_bufs = (char **) malloc(sizeof(char *) * size); - for (i = 0; i < size; i++) { - data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; - } - cur_win = solo_module->static_win; - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); - 
local_buf = - mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - l_seg_count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); - cur_win = solo_module->dynamic_win; - } else { - /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); - int num_segments = (count + seg_count - 1) / seg_count; - int last_count = count - seg_count * (num_segments - 1); - for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; - char *temp_rbuf = (char *)rbuf + seg_count * extent * i; - int temp_count = seg_count; - if (i == num_segments - 1) { - temp_count = last_count; - } - mca_coll_solo_allreduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, - comm, module); - } - return MPI_SUCCESS; - } - - *(int *) (solo_module->ctrl_bufs[rank]) = rank; - mac_coll_solo_barrier_intra(comm, module); - - int cur = rank; - for (i = 0; i < size; i++) { - if (cur != size - 1) { - seg_count = l_seg_count; - } else { - seg_count = count - cur * l_seg_count; - } - /* At first iteration, copy local data to the solo data buffer */ - if (cur == rank) { - cur_win->w_osc_module->osc_fence(0, cur_win); - cur_win->w_osc_module->osc_put((char *) sbuf + - cur * l_seg_count * extent, - seg_count, dtype, cur, - (ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - /* For other iterations, do operations on the solo data buffer */ - else { - cur_win->w_osc_module->osc_accumulate((char *) sbuf + - cur * l_seg_count * - extent, seg_count, dtype, cur, (ptrdiff_t) - data_bufs[cur], seg_count, dtype, op, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - cur = (cur - 1 + size) % size; - *(int 
*) (solo_module->ctrl_bufs[rank]) = - (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - mac_coll_solo_barrier_intra(comm, module); - - } - /* At last, all the processes copies data from the solo data buffer */ - char *c; - c = rbuf; - for (i = 0; i < size; i++) { - if (i != size - 1) { - seg_count = l_seg_count; - } else { - seg_count = count - i * l_seg_count; - } - cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, - (ptrdiff_t) data_bufs[i], seg_count, dtype, cur_win); - c = c + seg_count * extent; - } - cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - if (data_bufs != NULL) { - free(data_bufs); - data_bufs = NULL; - } - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); - } - return OMPI_SUCCESS; -} +} \ No newline at end of file diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c index 88d1cc12ee4..55224d846bf 100644 --- a/ompi/mca/coll/solo/coll_solo_bcast.c +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -17,13 +17,7 @@ int mca_coll_solo_bcast_intra(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { - mca_coll_solo_bcast_linear_intra_memcpy(buff, count, dtype, root, comm, module); - } - else { - mca_coll_solo_bcast_linear_intra_osc(buff, count, dtype, root, comm, module); - } - return OMPI_SUCCESS; + return mca_coll_solo_bcast_linear_intra_memcpy(buff, count, dtype, root, comm, module); } /* linear bcast with memcpy */ @@ -61,12 +55,12 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, /* Root copy data to the shared memory block */ if (rank == root) { - memcpy(data_buf, (char *) buff, count * extent); + 
mca_coll_solo_pack_to_shared(buff, (void *) data_buf, dtype, count, extent); } mac_coll_solo_barrier_intra(comm, module); /* Other processes copy data from the shared memory block */ if (rank != root) { - memcpy((char *) buff, data_buf, count * extent); + mca_coll_solo_unpack_from_shared(buff, (void *) data_buf, dtype, count, extent); } mac_coll_solo_barrier_intra(comm, module); if ((size_t) count * extent > mca_coll_solo_component.static_block_size && @@ -78,72 +72,6 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, return OMPI_SUCCESS; } -int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - - int rank = ompi_comm_rank(comm); - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - /* Init the data_buf - shared among all the processes */ - int id = 0; - char **attached_bufs = NULL; - MPI_Win cur_win; - char *data_buf; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { - data_buf = (char *) 0 + 4 * opal_cache_line_size; - cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_small_block_size) { - if (rank == root) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); - data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - count * extent); - attached_bufs = mca_coll_solo_attach_buf(solo_module, comm, data_buf, count * extent); - } else { - attached_bufs = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); - } - data_buf = attached_bufs[root]; - cur_win = solo_module->dynamic_win; - } else { - return mca_coll_solo_bcast_pipeline_intra_osc(buff, 
count, dtype, root, comm, module, - mca_coll_solo_component.mpool_small_block_size); - } - - /* Root copy to shared memory */ - cur_win->w_osc_module->osc_fence(0, cur_win); - if (rank == root) { - cur_win->w_osc_module->osc_put(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, - cur_win); - } - cur_win->w_osc_module->osc_fence(0, cur_win); - /* Other processes copy data from shared memory */ - if (rank != root) { - cur_win->w_osc_module->osc_get(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, - cur_win); - } - cur_win->w_osc_module->osc_fence(0, cur_win); - - if ((size_t) count * extent > mca_coll_solo_component.static_block_size && - (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { - if (rank == root) { - mca_coll_solo_detach_buf(solo_module, comm, data_buf, &attached_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); - } else { - mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); - } - } - - return OMPI_SUCCESS; -} - int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, struct ompi_datatype_t *dtype, int root, @@ -175,10 +103,7 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, seg_size); } - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + int seg_count = seg_size / extent; int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); @@ -188,12 +113,12 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, if (i == 0) { /* In the first iteration, root copies data to the current shared memory block */ if (rank == root) { - memcpy(data_bufs[cur], (char *) buff, seg_count * extent); + mca_coll_solo_pack_to_shared(buff, (void *) data_bufs[cur], dtype, seg_count, extent); } } else if ( i == num_segments) { /* In the last iteration, other processes copy 
data from the previous shared memory block */ - memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], last_count * extent); + mca_coll_solo_unpack_from_shared(((char *) buff) + seg_count * extent * (i - 1), (void *) data_bufs[pre], dtype, last_count, extent); } else { /** @@ -205,10 +130,10 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, if ( i == num_segments - 1) { temp_count = last_count; } - memcpy(data_bufs[cur], ((char *) buff) + seg_count * extent * i, temp_count * extent); + mca_coll_solo_pack_to_shared(((char *) buff) + seg_count * extent * i, data_bufs[cur], dtype, temp_count, extent); } else { - memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], seg_count * extent); + mca_coll_solo_unpack_from_shared(((char *) buff) + seg_count * extent * (i - 1), (void *) data_bufs[pre], dtype, seg_count, extent); } } mac_coll_solo_barrier_intra(comm, module); @@ -223,99 +148,3 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, return OMPI_SUCCESS; } - -int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module, - size_t seg_size) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - - int rank = ompi_comm_rank(comm); - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - /* Init the data_bufs - shared among all the processes, needs two for the pipelining */ - int ids[2]; - char **attached_bufs[2]; - MPI_Win cur_win = solo_module->dynamic_win; - char *data_bufs[2]; - int i; - for (i = 0; i < 2; i++) { - if (rank == root) { - ids[i] = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, seg_size); - data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, 
ids[i], - seg_size); - attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, data_bufs[i], seg_size); - } - else { - attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); - } - data_bufs[i] = attached_bufs[i][root]; - } - - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); - int num_segments = (count + seg_count - 1) / seg_count; - int last_count = count - seg_count * (num_segments - 1); - - cur_win->w_osc_module->osc_fence(0, cur_win); - for (i = 0; i <= num_segments; i++) { - int cur = i & 1; - int pre = !cur; - if (i == 0) { - /* In the first iteration, root copies data to the current shared memory block */ - if (rank == root) { - cur_win->w_osc_module->osc_put(buff, seg_count, dtype, root, (ptrdiff_t) data_bufs[cur], - seg_count, dtype, cur_win); - } - } - else if ( i == num_segments) { - /* In the last iteration, other processes copy data from the previous shared memory block */ - cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), - last_count, dtype, root, (ptrdiff_t) data_bufs[pre], - last_count, dtype, cur_win); - } - else { - /** - * For other iterations, root copies data to the current shared memory block and - * other proceeses copy data from the previous shared memory block. 
- */ - if (rank == root) { - int temp_count = seg_count; - if ( i == num_segments - 1) { - temp_count = last_count; - } - cur_win->w_osc_module->osc_put(((char *) buff) + seg_count * extent * i, - temp_count, dtype, root, (ptrdiff_t) data_bufs[cur], - temp_count, dtype, cur_win); - } - else { - cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), - seg_count, dtype, root, (ptrdiff_t) data_bufs[pre], - seg_count, dtype, cur_win); - } - } - cur_win->w_osc_module->osc_fence(0, cur_win); - } - - /* Return the data_bufs */ - for (i = 0; i < 2; i++) { - if (rank == root) { - mca_coll_solo_detach_buf(solo_module, comm, data_bufs[i], &attached_bufs[i]); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[i], seg_size); - } - else { - mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs[i]); - } - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/solo/coll_solo_module.c b/ompi/mca/coll/solo/coll_solo_module.c index 0b59625a0c1..4b07ed4132f 100644 --- a/ompi/mca/coll/solo/coll_solo_module.c +++ b/ompi/mca/coll/solo/coll_solo_module.c @@ -53,7 +53,6 @@ static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, static void mca_coll_solo_module_construct(mca_coll_solo_module_t * module) { module->enabled = false; - module->dynamic_win = NULL; module->static_win = NULL; module->ctrl_bufs = NULL; module->data_bufs = NULL; @@ -82,9 +81,6 @@ static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, // int rank = ompi_comm_rank(comm); // /* Free the windows */ - // if (m->dynamic_win != NULL) { - // ompi_win_free(m->dynamic_win); - // } // if (m->static_win != NULL) { // ompi_win_free(m->static_win); // } @@ -225,10 +221,6 @@ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_commu mca_coll_solo_component.solo_mpool = OBJ_NEW(mca_coll_solo_mpool_t); } - /* Create the dynamic_win */ - ompi_win_create_dynamic((opal_info_t *) (&ompi_mpi_info_null), comm, - 
&solo_module->dynamic_win); - /* Create the static_win with shared memory allocation */ mca_coll_solo_setup_static_win(solo_module, comm, mca_coll_solo_component.static_block_size); @@ -241,46 +233,6 @@ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_commu return OMPI_SUCCESS; } -/** - * Attach a memory block to the dynamic_win of a communicator, returns an array contains the - * addresses of all the blocks of the processes in the communicator. - * local_buf == NULL and local_buf_size == 0 means there is no block to be attached on this process. - */ -char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, size_t local_buf_size) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); - - char **attached_bufs = (char **) malloc(sizeof(char *) * size); - attached_bufs[rank] = local_buf; - ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, - MPI_DATATYPE_NULL, - attached_bufs, - 1, MPI_AINT, comm, - (mca_coll_base_module_t *) solo_module); - - solo_module->dynamic_win->w_osc_module->osc_win_attach(solo_module->dynamic_win, local_buf, - local_buf_size); - - return attached_bufs; -} - -/* Detach a memory block from the dynamic_win of a communicator */ -void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, char ***attached_bufs) -{ - if (local_buf != NULL) { - solo_module->dynamic_win->w_osc_module->osc_win_detach(solo_module->dynamic_win, local_buf); - } - - free(*attached_bufs); - *attached_bufs = NULL; - return; -} - /* Setup and initialize the static_win of a communicator */ void mca_coll_solo_setup_static_win(mca_coll_solo_module_t * solo_module, struct ompi_communicator_t *comm, size_t data_buf_size) diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 5ef92c960ba..0d4cbc4b612 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c 
+++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -18,15 +18,7 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { - mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); - } - else { - mca_coll_solo_reduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, root, comm, module); - - } - return OMPI_SUCCESS; - + return mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); } /** @@ -93,10 +85,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } } else { /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int seg_count = mca_coll_solo_component.mpool_large_block_size / extent; int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { @@ -122,13 +111,12 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } else { seg_count = count - cur * l_seg_count; } - /* At first iteration, copy local data to the solo data buffer */ + /* At first iteration, copy local data to the shared data buffer */ if (cur == rank) { - memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); + mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); - } - /* For other iterations, do operations on the solo data buffer */ + /* For other iterations, do operations on the shared data buffer */ else { ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, 
data_bufs[cur], seg_count, dtype); @@ -150,7 +138,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } else { seg_count = count - i * l_seg_count; } - memcpy((char *) c, data_bufs[i], seg_count * extent); + mca_coll_solo_copy((void *) data_bufs[i], (void *) c, dtype, seg_count, extent); c = c + seg_count * extent; } } @@ -172,129 +160,3 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, return OMPI_SUCCESS; } - -int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - int size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); - int i; - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } - - char **data_bufs = NULL; - int id; - MPI_Win cur_win; - char *local_buf = NULL; - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - data_bufs = (char **) malloc(sizeof(char *) * size); - for (i = 0; i < size; i++) { - data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; - } - cur_win = solo_module->static_win; - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); - local_buf = - mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - l_seg_count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); - cur_win 
= solo_module->dynamic_win; - } else { - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); - int num_segments = (count + seg_count - 1) / seg_count; - int last_count = count - seg_count * (num_segments - 1); - for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; - char *temp_rbuf = (char *)rbuf + seg_count * extent * i; - int temp_count = seg_count; - if (i == num_segments - 1) { - temp_count = last_count; - } - mca_coll_solo_reduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, - root, comm, module); - } - return MPI_SUCCESS; - } - - - *(int *) (solo_module->ctrl_bufs[rank]) = rank; - mac_coll_solo_barrier_intra(comm, module); - - int cur = rank; - for (i = 0; i < size; i++) { - if (cur != size - 1) { - seg_count = l_seg_count; - } else { - seg_count = count - cur * l_seg_count; - } - /* At first iteration, copy local data to the solo data buffer */ - if (cur == rank) { - cur_win->w_osc_module->osc_fence(0, cur_win); - cur_win->w_osc_module->osc_put((char *) sbuf + - cur * l_seg_count * extent, - seg_count, dtype, cur, - (ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - /* For other iterations, do operations on the solo data buffer */ - else { - cur_win->w_osc_module->osc_accumulate((char *) sbuf + - cur * l_seg_count * - extent, seg_count, dtype, cur, (ptrdiff_t) - data_bufs[cur], seg_count, dtype, op, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - cur = (cur - 1 + size) % size; - *(int *) (solo_module->ctrl_bufs[rank]) = - (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - mac_coll_solo_barrier_intra(comm, module); - - } - /* At last, root copies data from the solo data buffer */ - if (rank == root) { - char *c; - c = rbuf; - for (i = 0; i < size; i++) { - if (i != size - 1) { - seg_count = 
l_seg_count; - } else { - seg_count = count - i * l_seg_count; - } - cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, (ptrdiff_t) - data_bufs[i], seg_count, dtype, cur_win); - c = c + seg_count * extent; - } - } - cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - if (data_bufs != NULL) { - free(data_bufs); - data_bufs = NULL; - } - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); - } - return OMPI_SUCCESS; -} From 8e4e553c954a3ef9a436e2d36a97cd854b9cb790 Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Tue, 24 Dec 2019 12:07:32 -0500 Subject: [PATCH 7/7] Add MPI_IN_PLACE support in SOLO module --- ompi/mca/coll/solo/coll_solo.h | 8 ++++---- ompi/mca/coll/solo/coll_solo_allreduce.c | 22 ++++++++++++++++++---- ompi/mca/coll/solo/coll_solo_reduce.c | 24 ++++++++++++++++++++---- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h index 0e378275ba4..be32d159ef8 100644 --- a/ompi/mca/coll/solo/coll_solo.h +++ b/ompi/mca/coll/solo/coll_solo.h @@ -160,8 +160,8 @@ static inline void mca_coll_solo_pack_to_shared(void *local_buf, void *shared_bu memcpy((char *) shared_buf, (char *) local_buf, count * extent); } else { - MPI_Aint pos = 0; - ompi_datatype_pack_external("external32", local_buf, count, dtype, shared_buf, count * extent, &pos); + int pos = 0; + MPI_Pack(local_buf, count, dtype, shared_buf, count * extent, &pos, MPI_COMM_SELF); } } @@ -171,8 +171,8 @@ static inline void mca_coll_solo_unpack_from_shared(void *local_buf, void *share memcpy((char *) local_buf, (char *) shared_buf, count * extent); } else { - MPI_Aint pos = 0; - ompi_datatype_unpack_external("external32", shared_buf, count * extent, &pos, 
local_buf, count, dtype); + int pos = 0; + MPI_Unpack(shared_buf, count * extent, &pos, local_buf, count, dtype, MPI_COMM_SELF); } } diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 446495afa68..e850ff92c0a 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -18,7 +18,12 @@ int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - return mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); + if (ompi_op_is_commute(op)) { + return mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); + } + else { + return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module); + } } @@ -88,7 +93,11 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_sbuf; + if (sbuf == MPI_IN_PLACE) + temp_sbuf = MPI_IN_PLACE; + else + temp_sbuf = (char *)sbuf + seg_count * extent * i; char *temp_rbuf = (char *)rbuf + seg_count * extent * i; int temp_count = seg_count; if (i == num_segments - 1) { @@ -100,6 +109,11 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int return MPI_SUCCESS; } + char *sbuf_temp = (char *)sbuf; + if( sbuf == MPI_IN_PLACE ) { + sbuf_temp = (char *)rbuf; + } + *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -112,13 +126,13 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), 
(void *) data_bufs[cur], dtype, seg_count, extent); + mca_coll_solo_copy((void *) ((char *) sbuf_temp + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); } /* For other iterations, do operations on the solo data buffer */ else { - ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + ompi_op_reduce(op, (char *) sbuf_temp + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); mac_coll_solo_barrier_intra(comm, module); } diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 0d4cbc4b612..8b498ccc98e 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -18,7 +18,12 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - return mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); + if (ompi_op_is_commute(op)) { + return mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); + } + else { + return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, root, comm, module); + } } /** @@ -46,6 +51,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); int i; + ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); @@ -89,7 +95,11 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_sbuf; + if (sbuf == MPI_IN_PLACE) + temp_sbuf = MPI_IN_PLACE; + else + temp_sbuf = (char *)sbuf + seg_count * extent * i; char *temp_rbuf = (char *)rbuf + seg_count * extent * i; 
int temp_count = seg_count; if (i == num_segments - 1) { @@ -101,6 +111,11 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, return MPI_SUCCESS; } + char *sbuf_temp = (char *)sbuf; + if( sbuf == MPI_IN_PLACE ) { + sbuf_temp = (char *)rbuf; + } + *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -113,12 +128,13 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } /* At first iteration, copy local data to the shared data buffer */ if (cur == rank) { - mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); + mca_coll_solo_copy((void *) ((char *) sbuf_temp + cur * l_seg_count * extent), + (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); } /* For other iterations, do operations on the shared data buffer */ else { - ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + ompi_op_reduce(op, (char *) sbuf_temp + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); mac_coll_solo_barrier_intra(comm, module); }