From 37ac5c8121bab5f0127134113603371b9c6a5b96 Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Mon, 14 Oct 2019 16:34:14 -0400 Subject: [PATCH 1/7] Solo: a shared memory collective module --- ompi/mca/coll/solo/Makefile.am | 43 +++ ompi/mca/coll/solo/coll_solo.h | 190 ++++++++++++++ ompi/mca/coll/solo/coll_solo_allreduce.c | 274 +++++++++++++++++++ ompi/mca/coll/solo/coll_solo_barrier.c | 37 +++ ompi/mca/coll/solo/coll_solo_bcast.c | 147 +++++++++++ ompi/mca/coll/solo/coll_solo_component.c | 148 +++++++++++ ompi/mca/coll/solo/coll_solo_module.c | 320 +++++++++++++++++++++++ ompi/mca/coll/solo/coll_solo_mpool.c | 233 +++++++++++++++++ ompi/mca/coll/solo/coll_solo_mpool.h | 96 +++++++ ompi/mca/coll/solo/coll_solo_reduce.c | 279 ++++++++++++++++++++ 10 files changed, 1767 insertions(+) create mode 100644 ompi/mca/coll/solo/Makefile.am create mode 100644 ompi/mca/coll/solo/coll_solo.h create mode 100644 ompi/mca/coll/solo/coll_solo_allreduce.c create mode 100644 ompi/mca/coll/solo/coll_solo_barrier.c create mode 100644 ompi/mca/coll/solo/coll_solo_bcast.c create mode 100644 ompi/mca/coll/solo/coll_solo_component.c create mode 100644 ompi/mca/coll/solo/coll_solo_module.c create mode 100644 ompi/mca/coll/solo/coll_solo_mpool.c create mode 100644 ompi/mca/coll/solo/coll_solo_mpool.h create mode 100644 ompi/mca/coll/solo/coll_solo_reduce.c diff --git a/ompi/mca/coll/solo/Makefile.am b/ompi/mca/coll/solo/Makefile.am new file mode 100644 index 00000000000..36f095efa86 --- /dev/null +++ b/ompi/mca/coll/solo/Makefile.am @@ -0,0 +1,43 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + coll_solo.h \ + coll_solo_mpool.h \ + coll_solo_barrier.c \ + coll_solo_reduce.c \ + coll_solo_bcast.c \ + coll_solo_allreduce.c \ + coll_solo_component.c \ + coll_solo_module.c \ + coll_solo_mpool.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_solo_DSO +component_install += mca_coll_solo.la +else +component_noinst += libmca_coll_solo.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_solo_la_SOURCES = $(sources) +mca_coll_solo_la_LDFLAGS = -module -avoid-version +mca_coll_solo_la_LIBADD = + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_solo_la_SOURCES =$(sources) +libmca_coll_solo_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h new file mode 100644 index 00000000000..4d64c63cde0 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo.h @@ -0,0 +1,190 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_SOLO_EXPORT_H +#define MCA_COLL_SOLO_EXPORT_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/communicator/communicator.h" +#include "ompi/win/win.h" +#include "ompi/include/mpi.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "opal/util/info.h" +#include "ompi/op/op.h" +#include "opal/runtime/opal_progress.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "coll_solo_mpool.h" + +BEGIN_C_DECLS +/** + * Structure to hold the solo coll component. 
First it holds the base coll component, and then + * holds a bunch of solo-coll-component-specific stuff (e.g., current MCA param values). + */ + typedef struct mca_coll_solo_component_t { + /* Base coll component */ + mca_coll_base_component_2_0_0_t super; + + /* MCA parameters */ + /* Priority of the solo module */ + int solo_priority; + /* The size of data_bufs in the static_win */ + uint32_t static_block_size; + uint32_t mpool_small_block_size; + uint32_t mpool_small_block_num; + uint32_t mpool_large_block_size; + uint32_t mpool_large_block_num; + + /* Shared memory pool */ + mca_coll_solo_mpool_t *solo_mpool; +} mca_coll_solo_component_t; + +/* Coll solo module */ +typedef struct mca_coll_solo_module_t { + /* Base module */ + mca_coll_base_module_t super; + + /* Whether this module has been lazily initialized or not yet */ + bool enabled; + + /** + * osc alrogithms attach memory blocks to this bynamic window and use it to perform one-sided + * communications. + */ + MPI_Win dynamic_win; + + /** + * This window is created by ompi_win_allocate_shared such that each process contains a shared + * memory data buffer, and this data buffer is divided into two parts - ctrl_bufs and data_bufs. + */ + MPI_Win static_win; + /** + * The first 4 * opal_cache_line_size bytes in the shared memory data buffer in static_win, used + * to store control messages. + */ + char **ctrl_bufs; + /** + * The rest of the shared memory data buffer in static_win, which is intent to be used to + * tranfer very small messages. Its size is set by static_block_size. + */ + char **data_bufs; + + /* Identify which ctrl_buf is currently used in mac_coll_solo_barrier_intra. 
*/ + int barrier_tag; +} mca_coll_solo_module_t; +OBJ_CLASS_DECLARATION(mca_coll_solo_module_t); + +/** + * Global component instance + */ +OMPI_MODULE_DECLSPEC extern mca_coll_solo_component_t mca_coll_solo_component; + +/** + * coll module functions + */ +int mca_coll_solo_init_query(bool enable_progress_threads, bool enable_mpi_threads); + +mca_coll_base_module_t *mca_coll_solo_comm_query(struct ompi_communicator_t *comm, int *priority); + +/* Lazily enable a module (since it involves expensive memory allocation, etc.) */ +int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); + +/* Attach a memory block to the dynamic_win of a communicator */ +char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, + size_t local_buf_size); + +/* Detach a memory block from the dynamic_win of a communicator */ +void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, + char ***attached_bufs); + +/* Setup and initialize the static_win of a communicator */ +void mca_coll_solo_setup_static_win(mca_coll_solo_module_t *solo_module, + struct ompi_communicator_t *comm, + size_t data_buf_size); + +/* MPI_Barrier algorithms */ +int mac_coll_solo_barrier_intra(struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* MPI_Bcast algorithms */ +int mca_coll_solo_bcast_intra(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* MPI_Reduce 
algorithms */ +int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_reduce_ring_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t + *comm, mca_coll_base_module_t * module); + +int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* MPI_Allreduce algorithms */ +int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); +END_C_DECLS +#endif /* MCA_COLL_SOLO_EXPORT_H */ diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c new file mode 100644 index 00000000000..383d28b66d6 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { + mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); + } + else { + mca_coll_solo_allreduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, comm, module); + } + return OMPI_SUCCESS; + +} + + +/** + * Each process operates a part of the shared data buffer in turn. + * Suppose the number of processes is 4. + * Step 1: + * | P0 | P1 | P2 | P3 | + * Step 2: + * | P1 | P2 | P3 | P0 | + * Step 3: + * | P2 | P3 | P0 | P1 | + * Step 4: + * | P3 | P0 | P1 | P2 | + * At last, all the processes copy data back from the shared data buffer. + */ +int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + + char **data_bufs = NULL; + int *ids = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = solo_module->data_bufs; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + ids = (int *) malloc(sizeof(int) * size); + ids[rank] = + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + + 
ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, + MPI_DATATYPE_NULL, + ids, + 1, MPI_INT, comm, + (mca_coll_base_module_t *) + solo_module); + for (i = 0; i < size; i++) { + data_bufs[i] = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + count * extent); + } + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + } + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + //cur_win->w_osc_module->osc_fence(0, cur_win); + memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* For other iterations, do operations on the solo data buffer */ + else { + ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + data_bufs[cur], seg_count, dtype); + //cur_win->w_osc_module->osc_fence(0,cur_win); + mac_coll_solo_barrier_intra(comm, module); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, all the processes copy data from the solo data buffer */ + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + 
} else { + seg_count = count - i * l_seg_count; + } + memcpy((char *) c, data_bufs[i], seg_count * extent); + c = c + seg_count * extent; + } + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + ; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], + count * extent); + if (ids != NULL) { + free(ids); + ids = NULL; + } + + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + + } else { + //printf("TOO BIG\n"); + } + + + return OMPI_SUCCESS; +} + +int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char **data_bufs = NULL; + int id; + MPI_Win cur_win; + char *local_buf = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + for (i = 0; i < size; i++) { + data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; + } + cur_win = solo_module->static_win; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + local_buf = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + 
cur_win = solo_module->dynamic_win; + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + } + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + cur_win->w_osc_module->osc_fence(0, cur_win); + cur_win->w_osc_module->osc_put((char *) sbuf + + cur * l_seg_count * extent, + seg_count, dtype, cur, + (ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + /* For other iterations, do operations on the solo data buffer */ + else { + cur_win->w_osc_module->osc_accumulate((char *) sbuf + + cur * l_seg_count * + extent, seg_count, dtype, cur, (ptrdiff_t) + data_bufs[cur], seg_count, dtype, op, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, all the processes copies data from the solo data buffer */ + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - i * l_seg_count; + } + cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, + (ptrdiff_t) data_bufs[i], seg_count, dtype, cur_win); + c = c + seg_count * extent; + } + 
cur_win->w_osc_module->osc_fence(0, cur_win); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } else { + //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/solo/coll_solo_barrier.c b/ompi/mca/coll/solo/coll_solo_barrier.c new file mode 100644 index 00000000000..26777e92acd --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_barrier.c @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" +int mac_coll_solo_barrier_intra(struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + + int rank = ompi_comm_rank(comm); + /* Atomic add to current ctrl_buf */ + char *barrier_ctrl_bufs = solo_module->ctrl_bufs[0] + opal_cache_line_size; + opal_atomic_add_fetch_32((opal_atomic_int32_t *) (barrier_ctrl_bufs + solo_module->barrier_tag * opal_cache_line_size), 1); + while (*((int32_t *) (barrier_ctrl_bufs + (solo_module->barrier_tag) * opal_cache_line_size)) != ompi_comm_size(comm)) { + opal_progress(); + } + + /* Set previous used ctrl_buf to 0 */ + if (rank == 0) { + *((int32_t *) (barrier_ctrl_bufs + ((solo_module->barrier_tag + 2) % 3) * opal_cache_line_size)) = 0; + } + /* Set barrier_tag to next ctrl_buf */ + solo_module->barrier_tag = (solo_module->barrier_tag + 1) % 3; + return OMPI_SUCCESS; +} 
diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c new file mode 100644 index 00000000000..e0482f609cf --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -0,0 +1,147 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +int mca_coll_solo_bcast_intra(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { + mca_coll_solo_bcast_linear_intra_memcpy(buff, count, dtype, root, comm, module); + } + else { + mca_coll_solo_bcast_linear_intra_osc(buff, count, dtype, root, comm, module); + } + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + int id; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char *data_buf; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_buf = solo_module->data_bufs[root]; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + } + mca_coll_solo_bcast_linear_intra_memcpy(&id, 1, MPI_INT, root, comm, module); + data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + } else { + /* TODO: Add support for very large 
messages */ + //printf("TOO BIG\n"); + } + + //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + if (rank == root) { + memcpy(data_buf, (char *) buff, count * extent); + } + //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + mac_coll_solo_barrier_intra(comm, module); + if (rank != root) { + memcpy((char *) buff, data_buf, count * extent); + } + //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + mac_coll_solo_barrier_intra(comm, module); + if ((size_t) count * extent > mca_coll_solo_component.static_block_size && + (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } + } else { + /* TODO: Add support for very large messages */ + //printf("TOO BIG\n"); + } + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + int id = 0; + char **attached_bufs = NULL; + MPI_Win cur_win; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char *data_buf; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_buf = (char *) 0 + 4 * opal_cache_line_size; + cur_win = solo_module->static_win; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + attached_bufs = 
mca_coll_solo_attach_buf(solo_module, comm, data_buf, count * extent); + } else { + attached_bufs = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); + } + data_buf = attached_bufs[root]; + cur_win = solo_module->dynamic_win; + } else { + //printf("TOO BIG\n"); + } + + /* Root copy to shared memory */ + cur_win->w_osc_module->osc_fence(0, cur_win); + if (rank == root) { + cur_win->w_osc_module->osc_put(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, + cur_win); + } + cur_win->w_osc_module->osc_fence(0, cur_win); + /* Other processes copy data from shared memory */ + if (rank != root) { + cur_win->w_osc_module->osc_get(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, + cur_win); + } + cur_win->w_osc_module->osc_fence(0, cur_win); + + if ((size_t) count * extent > mca_coll_solo_component.static_block_size && + (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if (rank == root) { + mca_coll_solo_detach_buf(solo_module, comm, data_buf, &attached_bufs); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } else { + mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); + } + } else { + //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/solo/coll_solo_component.c b/ompi/mca/coll/solo/coll_solo_component.c new file mode 100644 index 00000000000..34116ba5d85 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_component.c @@ -0,0 +1,148 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "opal/util/show_help.h"
#include "ompi/constants.h"
#include "ompi/mca/coll/coll.h"
#include "coll_solo.h"


/**
 * Public string showing the coll ompi_solo component version number
 */
const char *mca_coll_solo_component_version_string =
    "Open MPI solo collective MCA component version " OMPI_VERSION;

/**
 * Local functions
 */
static int solo_close(void);
static int solo_register(void);

/**
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
mca_coll_solo_component_t mca_coll_solo_component = {

    /* First, fill in the super */

    {
        /* First, the mca_component_t struct containing meta
           information about the component itself */
        .collm_version = {
            MCA_COLL_BASE_VERSION_2_0_0,

            /* Component name and version */
            .mca_component_name = "solo",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION,
                                  OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component functions */
            .mca_close_component = solo_close,
            .mca_register_component_params = solo_register,
        },
        .collm_data = {
            /* The component is not checkpoint ready */
            MCA_BASE_METADATA_PARAM_NONE},

        /* Initialization / querying functions */
        .collm_init_query = mca_coll_solo_init_query,
        .collm_comm_query = mca_coll_solo_comm_query,
    },

    /* Shared-component specific information.
     * NOTE(review): these default literals must stay in sync with the values
     * re-assigned in solo_register() below. */

    /* (default) priority */
    0,
    /* (default) static_block_size */
    4096,
    /* (default) mpool_small_block_size */
    1048576,
    /* (default) mpool_small_block_num */
    0,
    /* (default) mpool_large_block_size */
    8388608,
    /* (default) mpool_large_block_num */
    0,
    /* (default) pointer to the shared mpool */
    NULL
};

/**
 * Shut down the component
 */
static int solo_close(void)
{
    return OMPI_SUCCESS;
}

/**
 * Register MCA params.  Each parameter is read-only after registration
 * (MCA_BASE_VAR_SCOPE_READONLY); the assignments re-state the component
 * defaults before mca_base_component_var_register() may overwrite them
 * from the environment / command line.
 */
static int solo_register(void)
{
    mca_base_component_t *c = &mca_coll_solo_component.super.collm_version;
    mca_coll_solo_component_t *cs = &mca_coll_solo_component;

    /**
     * If we want to be selected (i.e., all procs on one node), then we should have a high
     * priority.
     */
    cs->solo_priority = 0;
    (void) mca_base_component_var_register(c, "priority",
                                           "Priority of the solo coll component",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           0, OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->solo_priority);

    cs->static_block_size = 4096;
    (void) mca_base_component_var_register(c, "static_block_size",
                                           "static block size of the static window",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->static_block_size);

    cs->mpool_small_block_size = 1048576;
    (void) mca_base_component_var_register(c, "mpool_small_block_size",
                                           "small block size of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &cs->mpool_small_block_size);

    cs->mpool_small_block_num = 0;
    (void) mca_base_component_var_register(c, "mpool_small_block_num",
                                           "number of small blocks of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->mpool_small_block_num);

    cs->mpool_large_block_size = 8388608;
    (void) mca_base_component_var_register(c, "mpool_large_block_size",
                                           "large block size of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &cs->mpool_large_block_size);

    cs->mpool_large_block_num = 0;
    (void) mca_base_component_var_register(c, "mpool_large_block_num",
                                           "number of large blocks of the mpool",
                                           MCA_BASE_VAR_TYPE_UINT32_T, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &cs->mpool_large_block_num);

    return OMPI_SUCCESS;
}
diff --git a/ompi/mca/coll/solo/coll_solo_module.c b/ompi/mca/coll/solo/coll_solo_module.c
new file mode 100644
index 00000000000..49651a81acd
--- /dev/null
+++
b/ompi/mca/coll/solo/coll_solo_module.c @@ -0,0 +1,320 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include +#ifdef HAVE_SCHED_H +#include +#endif +#include +#ifdef HAVE_SYS_MMAN_H +#include +#endif /* HAVE_SYS_MMAN_H */ +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#include "mpi.h" +#include "opal_stdint.h" +#include "opal/mca/hwloc/base/base.h" +#include "opal/util/os_path.h" + +#include "ompi/communicator/communicator.h" +#include "ompi/group/group.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/mca/rte/rte.h" +#include "ompi/proc/proc.h" +#include "coll_solo.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" + + +/** + * Local functions + */ +static int mca_coll_solo_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); +static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); + +/* solo module constructor */ +static void mca_coll_solo_module_construct(mca_coll_solo_module_t * module) +{ + module->enabled = false; + module->dynamic_win = NULL; + module->static_win = NULL; + module->ctrl_bufs = NULL; + module->data_bufs = NULL; + module->barrier_tag = 0; + module->super.coll_module_disable = mca_coll_solo_module_disable; +} + +/* solo module destructor */ +static void mca_coll_solo_module_destruct(mca_coll_solo_module_t * module) +{ + return; +} + +/* Disable solo module */ +static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + if (module->base_data != NULL) { + OBJ_RELEASE(module->base_data); + } + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + 
solo_module->enabled = false; + + /* If comm is MPI_COMM_WORLD, windows will be free at ompi_mpi_finalize.c:320 ompi_win_finalize() */ + // if (comm != MPI_COMM_WORLD) { + // int rank = ompi_comm_rank(comm); + + // /* Free the windows */ + // if (m->dynamic_win != NULL) { + // ompi_win_free(m->dynamic_win); + // } + // if (m->static_win != NULL) { + // ompi_win_free(m->static_win); + // } + // } + + if (solo_module->ctrl_bufs != NULL) { + free(solo_module->ctrl_bufs); + solo_module->ctrl_bufs = NULL; + } + + if (solo_module->data_bufs != NULL) { + free(solo_module->data_bufs); + solo_module->data_bufs = NULL; + } + + return OMPI_SUCCESS; +} + +OBJ_CLASS_INSTANCE(mca_coll_solo_module_t, + mca_coll_base_module_t, + mca_coll_solo_module_construct, mca_coll_solo_module_destruct); + +/** + * Initial query function that is invoked during MPI_INIT, allowing this component to disqualify + * itself if it doesn't support the required level of thread support. This function is invoked + * exactly once. + */ +int mca_coll_solo_init_query(bool enable_progress_threads, bool enable_mpi_threads) +{ + /* if no session directory was created, then we cannot be used */ + if (NULL == ompi_process_info.job_session_dir) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* Don't do much here because we don't really want to allocate any + shared memory until this component is selected to be used. */ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:init_query: pick me! pick me!"); + return OMPI_SUCCESS; +} + + +/** + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. 
+ */ +mca_coll_base_module_t *mca_coll_solo_comm_query(struct ompi_communicator_t * comm, int *priority) +{ + mca_coll_solo_module_t *solo_module; + + /** + * If we're intercomm, or if there's only one process in the communicator, or if not all the + * processes in the communicator are not on this node, then we don't want to run. + */ + if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) + || ompi_group_have_remote_peers(comm->c_local_group)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + /* Get the priority level attached to this module. If priority is less + * than or equal to 0, then the module is unavailable. */ + *priority = mca_coll_solo_component.solo_priority; + if (0 >= mca_coll_solo_component.solo_priority) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:comm_query (%d/%s): priority too low; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + solo_module = OBJ_NEW(mca_coll_solo_module_t); + if (NULL == solo_module) { + return NULL; + } + + /* All is good -- return a module */ + solo_module->super.coll_module_enable = mca_coll_solo_module_enable; + solo_module->super.ft_event = NULL; + solo_module->super.coll_allgather = NULL; + solo_module->super.coll_allgatherv = NULL; + solo_module->super.coll_allreduce = mca_coll_solo_allreduce_intra; + solo_module->super.coll_alltoall = NULL; + solo_module->super.coll_alltoallv = NULL; + solo_module->super.coll_alltoallw = NULL; + solo_module->super.coll_barrier = mac_coll_solo_barrier_intra; + solo_module->super.coll_bcast = mca_coll_solo_bcast_intra; + solo_module->super.coll_exscan = NULL; + solo_module->super.coll_gather = NULL; + solo_module->super.coll_gatherv = NULL; + solo_module->super.coll_reduce = mca_coll_solo_reduce_intra; + 
solo_module->super.coll_reduce_scatter = NULL; + solo_module->super.coll_scan = NULL; + solo_module->super.coll_scatter = NULL; + solo_module->super.coll_scatterv = NULL; + + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:comm_query (%d/%s): pick me! pick me!", + comm->c_contextid, comm->c_name); + return &(solo_module->super); +} + +/* Init the solo module on the communicator */ +static int mca_coll_solo_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + /* prepare the placeholder for the array of request for invoking base module */ + module->base_data = OBJ_NEW(mca_coll_base_comm_t); + if (NULL == module->base_data) { + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +/* Enable the solo module on the communicator lazily */ +int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + /** + * Temporarily use tuned module to prevent the collective operations in this module are invoked + * before the initialization. 
+ */ + int var_id; + int tmp_priority = 100; + const int *origin_priority = NULL; + int tmp_origin = 0; + mca_base_var_find_by_name("coll_tuned_priority", &var_id); + mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); + tmp_origin = *origin_priority; + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + comm->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling; + + /* Create the mpool */ + if (mca_coll_solo_component.solo_mpool == NULL) { + mca_coll_solo_component.solo_mpool = OBJ_NEW(mca_coll_solo_mpool_t); + } + + /* Create the dynamic_win */ + ompi_win_create_dynamic((opal_info_t *) (&ompi_mpi_info_null), comm, + &solo_module->dynamic_win); + + /* Create the static_win with shared memory allocation */ + mca_coll_solo_setup_static_win(solo_module, comm, + mca_coll_solo_component.static_block_size); + + solo_module->enabled = true; + + /* Set the functions and the priority back */ + comm->c_coll->coll_allreduce = mca_coll_solo_allreduce_intra; + mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + return OMPI_SUCCESS; +} + +/** + * Attach a memory block to the dynamic_win of a communicator, returns an array contains the + * addresses of all the blocks of the processes in the communicator. + * local_buf == NULL and local_buf_size == 0 means there is no block to be attached on this process. 
+ */ +char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, size_t local_buf_size) +{ + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + char **attached_bufs = (char **) malloc(sizeof(char *) * size); + attached_bufs[rank] = local_buf; + ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, + MPI_DATATYPE_NULL, + attached_bufs, + 1, MPI_AINT, comm, + (mca_coll_base_module_t *) solo_module); + + solo_module->dynamic_win->w_osc_module->osc_win_attach(solo_module->dynamic_win, local_buf, + local_buf_size); + + return attached_bufs; +} + +/* Detach a memory block from the dynamic_win of a communicator */ +void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, + char *local_buf, char ***attached_bufs) +{ + if (local_buf != NULL) { + solo_module->dynamic_win->w_osc_module->osc_win_detach(solo_module->dynamic_win, local_buf); + } + + free(*attached_bufs); + *attached_bufs = NULL; + return; +} + +/* Setup and initialize the static_win of a communicator */ +void mca_coll_solo_setup_static_win(mca_coll_solo_module_t * solo_module, + struct ompi_communicator_t *comm, size_t data_buf_size) +{ + int i; + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + int *baseptr; + /* Create the static win */ + ompi_win_allocate_shared(4 * opal_cache_line_size + data_buf_size, + sizeof(char), + (opal_info_t *) (&ompi_mpi_info_null), comm, + &baseptr, &solo_module->static_win); + size_t static_size[size]; + int static_disp[size]; + solo_module->ctrl_bufs = (char **) malloc(sizeof(char *) * size); + solo_module->data_bufs = (char **) malloc(sizeof(char *) * size); + /** + * Get the shared memory address created with the static window, + * the first 4 * opal_cache_line_size is used for control messages, + * the rest is used for transfer very small messages. 
+ */ + for (i = 0; i < size; i++) { + solo_module->static_win->w_osc_module->osc_win_shared_query(solo_module->static_win, i, + &(static_size[i]), + &(static_disp[i]), + &(solo_module->ctrl_bufs[i])); + solo_module->data_bufs[i] = (char *) (solo_module->ctrl_bufs[i]) + 4 * opal_cache_line_size; + } + /* Init ctrl_bufs with 0s */ + solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); + for (i = 0; i < 4; i++) { + char *ptr = solo_module->ctrl_bufs[rank] + i * opal_cache_line_size; + *((int32_t *) ptr) = 0; + } + solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); +} diff --git a/ompi/mca/coll/solo/coll_solo_mpool.c b/ompi/mca/coll/solo/coll_solo_mpool.c new file mode 100644 index 00000000000..7dc7479a2a1 --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_mpool.c @@ -0,0 +1,233 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +static void mca_coll_solo_queue_construct(mca_coll_solo_queue_t * queue); +static void mca_coll_solo_queue_destruct(mca_coll_solo_queue_t * queue); + +/* queue constructor */ +static void mca_coll_solo_queue_construct(mca_coll_solo_queue_t * queue) +{ + return; +} + +/* queue destructor */ +static void mca_coll_solo_queue_destruct(mca_coll_solo_queue_t * queue) +{ + return; +} + +OBJ_CLASS_INSTANCE(mca_coll_solo_queue_t, opal_object_t, mca_coll_solo_queue_construct, + mca_coll_solo_queue_destruct); + +/* Init the queue with node-wise communicator, number of blocks and size of each block. 
*/ +void mca_coll_solo_queue_init(mca_coll_solo_queue_t * queue, ompi_communicator_t * node_comm, + int block_num, int block_size) +{ + int node_rank = ompi_comm_rank(node_comm); + queue->block_size = block_size; + queue->block_num = block_num; + int *temp_ptr; + int id_queue_size = opal_cache_line_size * (block_num + 3); + if (node_rank == 0) { + ompi_win_allocate_shared(block_size * block_num + id_queue_size, sizeof(char), + (opal_info_t *) (&ompi_mpi_info_null), + node_comm, &temp_ptr, &(queue->win)); + } else { + ompi_win_allocate_shared(0, sizeof(char), + (opal_info_t *) (&ompi_mpi_info_null), + node_comm, &temp_ptr, &(queue->win)); + } + size_t temp_size; + int temp_disp; + /* Get the address of the shared memory */ + queue->win->w_osc_module->osc_win_shared_query(queue->win, 0, &temp_size, &temp_disp, + &queue->blocks); + /* Set up the queue as shown in the coll_shared_mpool.h */ + queue->id_queue = queue->blocks + block_size * block_num; + queue->head = queue->id_queue + opal_cache_line_size * (block_num + 1); + queue->tail = queue->id_queue + opal_cache_line_size * (block_num + 2); + queue->win->w_osc_module->osc_fence(0, queue->win); + if (node_rank == 0) { + (*((mca_coll_solo_tag_t *) queue->head)).id = 0; + (*((mca_coll_solo_tag_t *) queue->head)).ref = 0; + *((COLL_SOLO_WORD *) queue->tail) = block_num; + int i; + for (i = 0; i < block_num + 1; i++) { + char *temp = queue->id_queue + opal_cache_line_size * i; + *((COLL_SOLO_WORD *) temp) = i + 1; + if (i == block_num) { + *((COLL_SOLO_WORD *) temp) = 0; + } + } + } + queue->win->w_osc_module->osc_fence(0, queue->win); + return; +} + +/* + * Request a block from the queue + */ +int mca_coll_solo_queue_request(mca_coll_solo_queue_t * queue) +{ + COLL_SOLO_DWORD cur_head, new_head; + COLL_SOLO_WORD cur_tail; + + do { + cur_head = *((COLL_SOLO_DWORD *) queue->head); + cur_tail = *((COLL_SOLO_WORD *) queue->tail); + if (((mca_coll_solo_tag_t *) &cur_head)->id == cur_tail) { + return -1; + } + new_head = 
cur_head; + ((mca_coll_solo_tag_t *) &new_head)->id = (((mca_coll_solo_tag_t *) &new_head)->id + 1) % + (queue->block_num + 1); + ((mca_coll_solo_tag_t *) &new_head)->ref = ((mca_coll_solo_tag_t *) &new_head)->ref + 1; + } while (!opal_atomic_compare_exchange_strong_64((COLL_SOLO_DWORD *) queue->head, + &cur_head, new_head)); + char *temp = queue->id_queue + opal_cache_line_size * ((mca_coll_solo_tag_t *) &cur_head)->id; + COLL_SOLO_WORD id = *((COLL_SOLO_WORD *) temp); + *((COLL_SOLO_WORD *) temp) = 0; + return id; +} + +/* + * Calculate block address based on block id + */ +char *mca_coll_solo_queue_calculate(mca_coll_solo_queue_t * queue, int id) +{ + return queue->blocks + queue->block_size * (id - 1); +} + +/* + * Return a block to the queue + */ +void mca_coll_solo_queue_return(mca_coll_solo_queue_t * queue, int id) +{ + COLL_SOLO_WORD cur_tail; + char *temp; + int32_t zero = 0; + do { + zero = 0; + cur_tail = *((COLL_SOLO_WORD *) queue->tail); + temp = queue->id_queue + opal_cache_line_size * cur_tail; + } while (!opal_atomic_compare_exchange_strong_32((COLL_SOLO_WORD *) temp, &zero, id)); + opal_atomic_compare_exchange_strong_32((COLL_SOLO_WORD *) queue->tail, &cur_tail, + (cur_tail + 1) % (queue->block_num + 1)); + return; +} + + +/* mpool classes */ +static void mca_coll_solo_mpool_construct(mca_coll_solo_mpool_t * mpool); +static void mca_coll_solo_mpool_destruct(mca_coll_solo_mpool_t * mpool); + +OBJ_CLASS_INSTANCE(mca_coll_solo_mpool_t, opal_object_t, mca_coll_solo_mpool_construct, + mca_coll_solo_mpool_destruct); + +/* mpool constructor */ +static void mca_coll_solo_mpool_construct(mca_coll_solo_mpool_t * mpool) +{ + /* Create the node_comm which contains all the processes on a node */ + ompi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + (opal_info_t *) (&ompi_mpi_info_null), &(mpool->node_comm)); + int node_size = ompi_comm_size(mpool->node_comm); + /* Create the queues */ + mpool->small_queue = OBJ_NEW(mca_coll_solo_queue_t); + 
mpool->large_queue = OBJ_NEW(mca_coll_solo_queue_t); + /* verify the mca parameters */ + if (mca_coll_solo_component.mpool_small_block_size > + mca_coll_solo_component.mpool_large_block_size) { + uint32_t temp = mca_coll_solo_component.mpool_small_block_size; + mca_coll_solo_component.mpool_small_block_size = + mca_coll_solo_component.mpool_large_block_size; + mca_coll_solo_component.mpool_large_block_size = temp; + } + if (mca_coll_solo_component.mpool_small_block_num < (uint32_t) node_size) { + if (mca_coll_solo_component.mpool_small_block_num == 0) { + mca_coll_solo_component.mpool_small_block_num = node_size * 4; + } + else { + mca_coll_solo_component.mpool_small_block_num = node_size; + } + } + if (mca_coll_solo_component.mpool_large_block_num < (uint32_t) node_size) { + if (mca_coll_solo_component.mpool_large_block_num == 0) { + mca_coll_solo_component.mpool_large_block_num = node_size * 2; + } + else { + mca_coll_solo_component.mpool_large_block_num = node_size; + } + } + /* Init the queues */ + mca_coll_solo_queue_init(mpool->small_queue, mpool->node_comm, + mca_coll_solo_component.mpool_small_block_num, + mca_coll_solo_component.mpool_small_block_size); + mca_coll_solo_queue_init(mpool->large_queue, mpool->node_comm, + mca_coll_solo_component.mpool_large_block_num, + mca_coll_solo_component.mpool_large_block_size); + return; +} + +/* mpool destructor */ +static void mca_coll_solo_mpool_destruct(mca_coll_solo_mpool_t * mpool) +{ + OBJ_RELEASE(mpool->small_queue); + OBJ_RELEASE(mpool->large_queue); + return; +} + +/* Request block from the memory pool */ +int mca_coll_solo_mpool_request(mca_coll_solo_mpool_t * mpool, size_t len) +{ + if (len > mca_coll_solo_component.mpool_large_block_size) { + return -1; + } + int id = -1; + while (id == -1) { + if (len <= mca_coll_solo_component.mpool_small_block_size) { + id = mca_coll_solo_queue_request(mpool->small_queue); + } else { + id = mca_coll_solo_queue_request(mpool->large_queue); + } + } + return id; +} + +/* 
Calculate block address */ +char *mca_coll_solo_mpool_calculate(mca_coll_solo_mpool_t * mpool, int id, size_t len) +{ + if (id <= 0 || len > mca_coll_solo_component.mpool_large_block_size) { + return NULL; + } + char *addr; + if (len <= mca_coll_solo_component.mpool_small_block_size) { + addr = mca_coll_solo_queue_calculate(mpool->small_queue, id); + } else { + addr = mca_coll_solo_queue_calculate(mpool->large_queue, id); + } + return addr; +} + +/* Return block to memory pool */ +void mca_coll_solo_mpool_return(mca_coll_solo_mpool_t * mpool, int id, size_t len) +{ + if (len <= mca_coll_solo_component.mpool_small_block_size) { + mca_coll_solo_queue_return(mpool->small_queue, id); + } else if (len <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_queue_return(mpool->large_queue, id); + } else { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:solo:mca_coll_solo_mpool_return: block size is wrong!"); + } + return; +} diff --git a/ompi/mca/coll/solo/coll_solo_mpool.h b/ompi/mca/coll/solo/coll_solo_mpool.h new file mode 100644 index 00000000000..695ebfb0dca --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_mpool.h @@ -0,0 +1,96 @@ +/** + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "opal/class/opal_object.h" +#include "opal/class/opal_hash_table.h" +#include "opal/class/opal_list.h" +#include "opal/threads/threads.h" + +#define COLL_SOLO_DWORD int64_t +#define COLL_SOLO_WORD int32_t +typedef struct { + /* the block id */ + COLL_SOLO_WORD id; + /* ref is added to resolve the potential ABA problem */ + COLL_SOLO_WORD ref; +} mca_coll_solo_tag_t; + +/** + * A lock-free array-based queue containing the blocks which can be accessed by any processes + * on the same node. 
+ * An example of the queue is shown below (block_num is n, the size of each element in id queue, + * head and tail is opal_cache_line_size to avoid false sharing): + * Init: + * | blocks | id queue | head | tail | + * | block1 (avail) | block2 (avail) |...| blockn (avail) | 1 | 2 | 3 |...| n | 0 | 0/0 | n | + * Request a block - 0 in the id queue means it is not available: + * | block1 (using) | block2 (avail) |...| blockn (avail) | 0 | 2 | 3 |...| n | 0 | 1/1 | n | + * Request another block: + * | block1 (using) | block2 (using) |...| blockn (avail) | 0 | 0 | 3 |...| n | 0 | 2/2 | n | + * Return block 2: + * | block1 (using) | block2 (avail) |...| blockn (avail) | 0 | 0 | 3 |...| n | 2 | 2/2 | 0 | + */ +struct mca_coll_solo_queue_t { + /* the start address of blocks */ + char *blocks; + /* the number of blocks */ + int block_num; + /* the size of each block */ + size_t block_size; + /* the start address of id queue */ + char *id_queue; + /* the address of head */ + char *head; + /* the address of tail */ + char *tail; + /* a node-wise window */ + MPI_Win win; +}; + +typedef struct mca_coll_solo_queue_t mca_coll_solo_queue_t; + +OBJ_CLASS_DECLARATION(mca_coll_solo_queue_t); + +/* Init the queue */ +void mca_coll_solo_queue_init(mca_coll_solo_queue_t * queue, ompi_communicator_t * node_comm, + int block_num, int block_size); +/* Request a block from the queue, return a block id */ +int mca_coll_solo_queue_request(mca_coll_solo_queue_t * queue); +/* Calculate the block address with a block id */ +char *mca_coll_solo_queue_calculate(mca_coll_solo_queue_t * queue, int id); +/* Return a block to the queue */ +void mca_coll_solo_queue_return(mca_coll_solo_queue_t * queue, int id); + +/* Each node has a shared memory pool, which contains two queues of different block sizes.*/ +struct mca_coll_solo_mpool_t { + /* Generic parent class for all Open MPI objects */ + opal_object_t super; + /* An array-based queue contains small blocks */ + mca_coll_solo_queue_t *small_queue; 
+ /* An array-based queue contains large blocks */ + mca_coll_solo_queue_t *large_queue; + /* A communicator contains all the processes on a node */ + ompi_communicator_t *node_comm; +}; + +typedef struct mca_coll_solo_mpool_t mca_coll_solo_mpool_t; + +OBJ_CLASS_DECLARATION(mca_coll_solo_mpool_t); + +/* Request block from memory pool */ +int mca_coll_solo_mpool_request(mca_coll_solo_mpool_t * mpool, size_t len); + +/* Calculate block address */ +char *mca_coll_solo_mpool_calculate(mca_coll_solo_mpool_t * mpool, int id, size_t len); + +/* Return block to memory pool */ +void mca_coll_solo_mpool_return(mca_coll_solo_mpool_t * mpool, int id, size_t len); diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c new file mode 100644 index 00000000000..1e2cab430ef --- /dev/null +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_solo.h" + +int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { + mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); + } + else { + mca_coll_solo_reduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, root, comm, module); + + } + return OMPI_SUCCESS; + +} + +/** + * Each process operates a part of the shared data buffer in turn. + * Suppose the number of processes is 4. + * Step 1: + * | P0 | P1 | P2 | P3 | + * Step 2: + * | P1 | P2 | P3 | P0 | + * Step 3: + * | P2 | P3 | P0 | P1 | + * Step 4: + * | P3 | P0 | P1 | P2 | + * At last, root copies data back from the shared data buffer. 
+ */ +int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, struct ompi_communicator_t + *comm, mca_coll_base_module_t * module) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + + char **data_bufs = NULL; + int *ids = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = solo_module->data_bufs; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + ids = (int *) malloc(sizeof(int) * size); + ids[rank] = + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + + ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, + MPI_DATATYPE_NULL, + ids, + 1, MPI_INT, comm, + (mca_coll_base_module_t *) + solo_module); + for (i = 0; i < size; i++) { + data_bufs[i] = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + count * extent); + } + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + 
} + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + //cur_win->w_osc_module->osc_fence(0, cur_win); + memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* For other iterations, do operations on the solo data buffer */ + else { + ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + data_bufs[cur], seg_count, dtype); + //cur_win->w_osc_modulbe->osc_fence(0,cur_win); + mac_coll_solo_barrier_intra(comm, module); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, root copies data from the solo data buffer */ + if (rank == root) { + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - i * l_seg_count; + } + memcpy((char *) c, data_bufs[i], seg_count * extent); + c = c + seg_count * extent; + } + } + //cur_win->w_osc_module->osc_fence(0, cur_win); + mac_coll_solo_barrier_intra(comm, module); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + ; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], + count * extent); + if (ids != NULL) { + free(ids); + ids = NULL; + } + + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + } else { + //printf("TOO BIG\n"); + } + + + return OMPI_SUCCESS; +} + +int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + 
mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + int i; + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + char **data_bufs = NULL; + int id; + MPI_Win cur_win; + char *local_buf = NULL; + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + data_bufs = (char **) malloc(sizeof(char *) * size); + for (i = 0; i < size; i++) { + data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; + } + cur_win = solo_module->static_win; + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + local_buf = + mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, + count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + cur_win = solo_module->dynamic_win; + } else { + //printf("TOO BIG\n"); + } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + *(int *) (solo_module->ctrl_bufs[rank]) = rank; + //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + int cur = rank; + for (i = 0; i < size; i++) { + if (cur != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - cur * l_seg_count; + } + /* At first iteration, copy local data to the solo data buffer */ + if (cur == rank) { + cur_win->w_osc_module->osc_fence(0, cur_win); + cur_win->w_osc_module->osc_put((char *) sbuf + + cur * l_seg_count * extent, + seg_count, dtype, cur, + 
(ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + /* For other iterations, do operations on the solo data buffer */ + else { + cur_win->w_osc_module->osc_accumulate((char *) sbuf + + cur * l_seg_count * + extent, seg_count, dtype, cur, (ptrdiff_t) + data_bufs[cur], seg_count, dtype, op, cur_win); + cur_win->w_osc_module->osc_fence(0, cur_win); + } + cur = (cur - 1 + size) % size; + *(int *) (solo_module->ctrl_bufs[rank]) = + (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; + //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); + mac_coll_solo_barrier_intra(comm, module); + + } + /* At last, root copies data from the solo data buffer */ + if (rank == root) { + char *c; + c = rbuf; + for (i = 0; i < size; i++) { + if (i != size - 1) { + seg_count = l_seg_count; + } else { + seg_count = count - i * l_seg_count; + } + cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, (ptrdiff_t) + data_bufs[i], seg_count, dtype, cur_win); + c = c + seg_count * extent; + } + } + cur_win->w_osc_module->osc_fence(0, cur_win); + if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if (data_bufs != NULL) { + free(data_bufs); + data_bufs = NULL; + } + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + } else { + //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} From d7f237924ff8149ec44ed1ca299ed7154ae45bfb Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Thu, 17 Oct 2019 14:40:22 -0400 Subject: [PATCH 2/7] Add a pipeline algorithm of broadcast to support very large message. 
--- ompi/mca/coll/solo/coll_solo.h | 14 ++ ompi/mca/coll/solo/coll_solo_bcast.c | 208 +++++++++++++++++++++++--- ompi/mca/coll/solo/coll_solo_module.c | 2 +- 3 files changed, 206 insertions(+), 18 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h index 4d64c63cde0..83c7e040d02 100644 --- a/ompi/mca/coll/solo/coll_solo.h +++ b/ompi/mca/coll/solo/coll_solo.h @@ -141,6 +141,20 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size); + +int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size); + /* MPI_Reduce algorithms */ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c index e0482f609cf..bcddb724d56 100644 --- a/ompi/mca/coll/solo/coll_solo_bcast.c +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -26,6 +26,7 @@ int mca_coll_solo_bcast_intra(void *buff, int count, return OMPI_SUCCESS; } +/* linear bcast with memcpy */ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, struct ompi_datatype_t *dtype, int root, @@ -35,17 +36,18 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; int rank = ompi_comm_rank(comm); - int id; ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); /* Enable solo module if necessary */ if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + /* Init the data_buf - shared among all the processes 
*/ + int id; char *data_buf; if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { data_buf = solo_module->data_bufs[root]; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_small_block_size) { if (rank == root) { id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); } @@ -53,29 +55,25 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, count * extent); } else { - /* TODO: Add support for very large messages */ - //printf("TOO BIG\n"); + return mca_coll_solo_bcast_pipeline_intra_memcpy(buff, count, dtype, root, comm, module, + mca_coll_solo_component.mpool_small_block_size); } - //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); + /* Root copy data to the shared memory block */ if (rank == root) { memcpy(data_buf, (char *) buff, count * extent); } - //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); mac_coll_solo_barrier_intra(comm, module); + /* Other processes copy data from the shared memory block */ if (rank != root) { memcpy((char *) buff, data_buf, count * extent); } - //solo_module->dynamic_win->w_osc_module->osc_fence(0, solo_module->dynamic_win); mac_coll_solo_barrier_intra(comm, module); if ((size_t) count * extent > mca_coll_solo_component.static_block_size && (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { if (rank == root) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); } - } else { - /* TODO: Add support for very large messages */ - //printf("TOO BIG\n"); } return OMPI_SUCCESS; } @@ -89,20 +87,21 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; int rank = ompi_comm_rank(comm); 
- int id = 0; - char **attached_bufs = NULL; - MPI_Win cur_win; ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); /* Enable solo module if necessary */ if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + /* Init the data_buf - shared among all the processes */ + int id = 0; + char **attached_bufs = NULL; + MPI_Win cur_win; char *data_buf; if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { data_buf = (char *) 0 + 4 * opal_cache_line_size; cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_small_block_size) { if (rank == root) { id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, @@ -114,7 +113,8 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, data_buf = attached_bufs[root]; cur_win = solo_module->dynamic_win; } else { - //printf("TOO BIG\n"); + return mca_coll_solo_bcast_pipeline_intra_osc(buff, count, dtype, root, comm, module, + mca_coll_solo_component.mpool_small_block_size); } /* Root copy to shared memory */ @@ -139,9 +139,183 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, } else { mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); } - } else { - //printf("TOO BIG\n"); + } + + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if 
(!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + /* Init the data_bufs - shared among all the processes, needs two for the pipelining */ + int ids[2]; + char *data_bufs[2]; + int i; + for (i = 0; i < 2; i++) { + if (rank == root) { + ids[i] = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, seg_size); + } + } + mca_coll_solo_bcast_linear_intra_memcpy(ids, 2, MPI_INT, root, comm, module); + for (i = 0; i < 2; i++) { + data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + seg_size); + } + + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + + for (i = 0; i <= num_segments; i++) { + int cur = i & 1; + int pre = !cur; + if (i == 0) { + /* In the first iteration, root copies data to the current shared memory block */ + if (rank == root) { + memcpy(data_bufs[cur], (char *) buff, seg_count * extent); + } + } + else if ( i == num_segments) { + /* In the last iteration, other processes copy data from the previous shared memory block */ + memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], last_count * extent); + } + else { + /** + * For other iterations, root copies data to the current shared memory block and + * other proceeses copy data from the previous shared memory block. 
+ */ + if (rank == root) { + int temp_count = seg_count; + if ( i == num_segments - 1) { + temp_count = last_count; + } + memcpy(data_bufs[cur], ((char *) buff) + seg_count * extent * i, temp_count * extent); + } + else { + memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], seg_count * extent); + } + } + mac_coll_solo_barrier_intra(comm, module); } + /* Return the data_bufs */ + for (i = 0; i < 2; i++) { + if (rank == root) { + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[i], seg_size); + } + } + + return OMPI_SUCCESS; +} + +int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module, + size_t seg_size) +{ + mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; + + int rank = ompi_comm_rank(comm); + ptrdiff_t extent, lower_bound; + ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Enable solo module if necessary */ + if (!solo_module->enabled) { + mca_coll_solo_lazy_enable(module, comm); + } + /* Init the data_bufs - shared among all the processes, needs two for the pipelining */ + int ids[2]; + char **attached_bufs[2]; + MPI_Win cur_win = solo_module->dynamic_win; + char *data_bufs[2]; + int i; + for (i = 0; i < 2; i++) { + if (rank == root) { + ids[i] = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, seg_size); + data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], + seg_size); + attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, data_bufs[i], seg_size); + } + else { + attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); + } + data_bufs[i] = attached_bufs[i][root]; + } + + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = 
count - seg_count * (num_segments - 1); + + cur_win->w_osc_module->osc_fence(0, cur_win); + for (i = 0; i <= num_segments; i++) { + int cur = i & 1; + int pre = !cur; + if (i == 0) { + /* In the first iteration, root copies data to the current shared memory block */ + if (rank == root) { + cur_win->w_osc_module->osc_put(buff, seg_count, dtype, root, (ptrdiff_t) data_bufs[cur], + seg_count, dtype, cur_win); + } + } + else if ( i == num_segments) { + /* In the last iteration, other processes copy data from the previous shared memory block */ + cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), + last_count, dtype, root, (ptrdiff_t) data_bufs[pre], + last_count, dtype, cur_win); + } + else { + /** + * For other iterations, root copies data to the current shared memory block and + * other proceeses copy data from the previous shared memory block. + */ + if (rank == root) { + int temp_count = seg_count; + if ( i == num_segments - 1) { + temp_count = last_count; + } + cur_win->w_osc_module->osc_put(((char *) buff) + seg_count * extent * i, + temp_count, dtype, root, (ptrdiff_t) data_bufs[cur], + temp_count, dtype, cur_win); + } + else { + cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), + seg_count, dtype, root, (ptrdiff_t) data_bufs[pre], + seg_count, dtype, cur_win); + } + } + cur_win->w_osc_module->osc_fence(0, cur_win); + } + + /* Return the data_bufs */ + for (i = 0; i < 2; i++) { + if (rank == root) { + mca_coll_solo_detach_buf(solo_module, comm, data_bufs[i], &attached_bufs[i]); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[i], seg_size); + } + else { + mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs[i]); + } + } + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/solo/coll_solo_module.c b/ompi/mca/coll/solo/coll_solo_module.c index 49651a81acd..0b59625a0c1 100644 --- a/ompi/mca/coll/solo/coll_solo_module.c +++ b/ompi/mca/coll/solo/coll_solo_module.c @@ -231,7 +231,7 
@@ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_commu /* Create the static_win with shared memory allocation */ mca_coll_solo_setup_static_win(solo_module, comm, - mca_coll_solo_component.static_block_size); + mca_coll_solo_component.static_block_size); solo_module->enabled = true; From 145589c81894132f27a34056f6def7122d81a7bd Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Thu, 17 Oct 2019 20:39:55 -0400 Subject: [PATCH 3/7] Support very large message for reduce and allreduce. --- ompi/mca/coll/solo/coll_solo_allreduce.c | 47 +++++++++++----- ompi/mca/coll/solo/coll_solo_bcast.c | 2 +- ompi/mca/coll/solo/coll_solo_reduce.c | 68 +++++++++++++++--------- 3 files changed, 77 insertions(+), 40 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 383d28b66d6..4eb4b31d6d2 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -82,7 +82,23 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count * extent); } } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_allreduce_ring_intra_memcpy(temp_sbuf, temp_rbuf, temp_count, dtype, op, + comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ @@ -92,9 +108,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int if (rank == size - 1) { seg_count = count - rank * l_seg_count; 
} - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -106,9 +120,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - //cur_win->w_osc_module->osc_fence(0, cur_win); memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); } @@ -116,13 +128,11 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int else { ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); - //cur_win->w_osc_module->osc_fence(0,cur_win); mac_coll_solo_barrier_intra(comm, module); } cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -138,7 +148,6 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int memcpy((char *) c, data_bufs[i], seg_count * extent); c = c + seg_count * extent; } - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { ; @@ -155,11 +164,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int data_bufs = NULL; } - } else { - //printf("TOO BIG\n"); } - - return OMPI_SUCCESS; } @@ -198,7 +203,23 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); cur_win = 
solo_module->dynamic_win; } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_allreduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, + comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c index bcddb724d56..88d1cc12ee4 100644 --- a/ompi/mca/coll/solo/coll_solo_bcast.c +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -139,7 +139,7 @@ int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, } else { mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); } - } + } return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 1e2cab430ef..336ad845c57 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -43,11 +43,12 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, * At last, root copies data back from the shared data buffer. 
*/ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, struct ompi_communicator_t - *comm, mca_coll_base_module_t * module) + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) { mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; int size = ompi_comm_size(comm); @@ -83,7 +84,23 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, count * extent); } } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_reduce_ring_intra_memcpy(temp_sbuf, temp_rbuf, temp_count, dtype, op, + root, comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ @@ -93,9 +110,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, if (rank == size - 1) { seg_count = count - rank * l_seg_count; } - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -107,9 +122,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - //cur_win->w_osc_module->osc_fence(0, 
cur_win); memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); } @@ -117,13 +130,11 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, else { ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); - //cur_win->w_osc_modulbe->osc_fence(0,cur_win); mac_coll_solo_barrier_intra(comm, module); } cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -141,11 +152,9 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, c = c + seg_count * extent; } } - //cur_win->w_osc_module->osc_fence(0, cur_win); mac_coll_solo_barrier_intra(comm, module); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { - ; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if ((size_t) count * extent > mca_coll_solo_component.static_block_size && + (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], count * extent); if (ids != NULL) { @@ -157,11 +166,8 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, free(data_bufs); data_bufs = NULL; } - } else { - //printf("TOO BIG\n"); } - return OMPI_SUCCESS; } @@ -201,7 +207,23 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); cur_win = solo_module->dynamic_win; } else { - //printf("TOO BIG\n"); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + 
COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int num_segments = (count + seg_count - 1) / seg_count; + int last_count = count - seg_count * (num_segments - 1); + for (int i = 0; i < num_segments; i++) { + char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_rbuf = (char *)rbuf + seg_count * extent * i; + int temp_count = seg_count; + if (i == num_segments - 1) { + temp_count = last_count; + } + mca_coll_solo_reduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, + root, comm, module); + } + return MPI_SUCCESS; } /* Set up segment count */ @@ -211,9 +233,7 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, if (rank == size - 1) { seg_count = count - rank * l_seg_count; } - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -243,7 +263,6 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -271,9 +290,6 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); - } else { - //printf("TOO BIG\n"); } - return OMPI_SUCCESS; } From a7d96d9251fa647b7e01ad80b900353d3ef7e5ce Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Thu, 17 Oct 2019 20:48:27 -0400 Subject: [PATCH 4/7] Fix comments in coll_solo_allreduce.c --- 
ompi/mca/coll/solo/coll_solo_allreduce.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 4eb4b31d6d2..7a3d7bb404e 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -229,9 +229,7 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou if (rank == size - 1) { seg_count = count - rank * l_seg_count; } - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); *(int *) (solo_module->ctrl_bufs[rank]) = rank; - //solo_module->static_win->w_osc_module->osc_fence(0,solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); int cur = rank; @@ -261,7 +259,6 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou cur = (cur - 1 + size) % size; *(int *) (solo_module->ctrl_bufs[rank]) = (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - //solo_module->static_win->w_osc_module->osc_fence(0, solo_module->static_win); mac_coll_solo_barrier_intra(comm, module); } @@ -287,9 +284,6 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); - } else { - //printf("TOO BIG\n"); - } - + } return OMPI_SUCCESS; } From b0ca414a22cef7178b43345dc69c4ab7364563ee Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Mon, 21 Oct 2019 14:23:51 -0400 Subject: [PATCH 5/7] Fix the memory allocation problem in reduce and allreduce. 
(should allocate l_seg_count instead of count) --- ompi/mca/coll/solo/coll_solo_allreduce.c | 64 +++++++++++++----------- ompi/mca/coll/solo/coll_solo_reduce.c | 63 ++++++++++++----------- 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 7a3d7bb404e..7637b1cad29 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -60,15 +60,23 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int mca_coll_solo_lazy_enable(module, comm); } + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + char **data_bufs = NULL; int *ids = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = solo_module->data_bufs; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); ids = (int *) malloc(sizeof(int) * size); ids[rank] = - mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, @@ -79,9 +87,10 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int for (i = 0; i < size; i++) { data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], - count * extent); + l_seg_count * extent); } } else { + /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ int seg_count = count; 
size_t typelng; ompi_datatype_type_size(dtype, &typelng); @@ -101,13 +110,6 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -149,11 +151,10 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int c = c + seg_count * extent; } mac_coll_solo_barrier_intra(comm, module); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { - ; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if ((size_t) l_seg_count * extent > mca_coll_solo_component.static_block_size && + (size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], - count * extent); + l_seg_count * extent); if (ids != NULL) { free(ids); ids = NULL; @@ -185,24 +186,34 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + char **data_bufs = NULL; int id; MPI_Win cur_win; char *local_buf = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); for (i = 0; i < size; i++) { data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; } cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { - 
id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); local_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + l_seg_count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); cur_win = solo_module->dynamic_win; } else { + /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ int seg_count = count; size_t typelng; ompi_datatype_type_size(dtype, &typelng); @@ -222,13 +233,6 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -276,14 +280,14 @@ int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int cou c = c + seg_count * extent; } cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { if (data_bufs != NULL) { free(data_bufs); data_bufs = NULL; } - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + 
mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); } return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 336ad845c57..5ef92c960ba 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -57,6 +57,14 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + /* Enable solo module if necessary */ if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); @@ -64,13 +72,13 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, char **data_bufs = NULL; int *ids = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = solo_module->data_bufs; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); ids = (int *) malloc(sizeof(int) * size); ids[rank] = - mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, @@ -81,9 +89,10 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, for (i = 0; i < size; i++) { data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, ids[i], - count * extent); + l_seg_count * extent); } } else { + /* For the messages which are greater than 
mpool_large_block_size*np, invoke this reduce multiple times */ int seg_count = count; size_t typelng; ompi_datatype_type_size(dtype, &typelng); @@ -103,13 +112,6 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -153,10 +155,10 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } } mac_coll_solo_barrier_intra(comm, module); - if ((size_t) count * extent > mca_coll_solo_component.static_block_size && - (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + if ((size_t) l_seg_count * extent > mca_coll_solo_component.static_block_size && + (size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[rank], - count * extent); + l_seg_count * extent); if (ids != NULL) { free(ids); ids = NULL; @@ -189,22 +191,31 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, if (!solo_module->enabled) { mca_coll_solo_lazy_enable(module, comm); } + + /* Set up segment count */ + int seg_count, l_seg_count; + seg_count = count / size; + l_seg_count = seg_count; + if (rank == size - 1) { + seg_count = count - rank * l_seg_count; + } + char **data_bufs = NULL; int id; MPI_Win cur_win; char *local_buf = NULL; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { data_bufs = (char **) malloc(sizeof(char *) * size); for (i = 0; i < size; i++) { data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; } cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= 
mca_coll_solo_component.mpool_large_block_size) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { + id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); local_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, count * extent); + l_seg_count * extent); + data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); cur_win = solo_module->dynamic_win; } else { int seg_count = count; @@ -226,13 +237,7 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, return MPI_SUCCESS; } - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } + *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -282,14 +287,14 @@ int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, } } cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { + if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { if (data_bufs != NULL) { free(data_bufs); data_bufs = NULL; } - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { + } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); + mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); } return OMPI_SUCCESS; } From f6466bcc19bc5ea31b93027e71d36dcc4ffebeb5 Mon Sep 17 00:00:00 2001 From: Xi Luo 
Date: Tue, 26 Nov 2019 11:39:25 -0500 Subject: [PATCH 6/7] Redo the non contiguous datatype part in SOLO module. Known problems: 1. bcast: has bug if a datatype is from MPI_Bottom 2. reduce \ allreduce : fix or remove the non contiguous support. --- ompi/mca/coll/solo/coll_solo.h | 81 +++++----- ompi/mca/coll/solo/coll_solo_allreduce.c | 143 +---------------- ompi/mca/coll/solo/coll_solo_bcast.c | 187 +---------------------- ompi/mca/coll/solo/coll_solo_module.c | 48 ------ ompi/mca/coll/solo/coll_solo_reduce.c | 150 +----------------- 5 files changed, 53 insertions(+), 556 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h index 83c7e040d02..0e378275ba4 100644 --- a/ompi/mca/coll/solo/coll_solo.h +++ b/ompi/mca/coll/solo/coll_solo.h @@ -59,12 +59,6 @@ typedef struct mca_coll_solo_module_t { /* Whether this module has been lazily initialized or not yet */ bool enabled; - /** - * osc alrogithms attach memory blocks to this bynamic window and use it to perform one-sided - * communications. - */ - MPI_Win dynamic_win; - /** * This window is created by ompi_win_allocate_shared such that each process contains a shared * memory data buffer, and this data buffer is divided into two parts - ctrl_bufs and data_bufs. @@ -101,18 +95,6 @@ mca_coll_base_module_t *mca_coll_solo_comm_query(struct ompi_communicator_t *com /* Lazily enable a module (since it involves expensive memory allocation, etc.) 
*/ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); -/* Attach a memory block to the dynamic_win of a communicator */ -char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, - size_t local_buf_size); - -/* Detach a memory block from the dynamic_win of a communicator */ -void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, - char ***attached_bufs); - /* Setup and initialize the static_win of a communicator */ void mca_coll_solo_setup_static_win(mca_coll_solo_module_t *solo_module, struct ompi_communicator_t *comm, @@ -135,12 +117,6 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); - int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, struct ompi_datatype_t *dtype, int root, @@ -148,13 +124,6 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, mca_coll_base_module_t * module, size_t seg_size); -int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module, - size_t seg_size); - /* MPI_Reduce algorithms */ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, @@ -163,12 +132,6 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_reduce_ring_intra(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - 
mca_coll_base_module_t * module); - int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, @@ -176,11 +139,6 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int cou struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); /* MPI_Allreduce algorithms */ int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, int count, @@ -195,10 +153,39 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module); + +/* Solo pack to shared memory */ +static inline void mca_coll_solo_pack_to_shared(void *local_buf, void *shared_buf, struct ompi_datatype_t *dtype, int count, ptrdiff_t extent) { + if (ompi_datatype_is_predefined(dtype)) { + memcpy((char *) shared_buf, (char *) local_buf, count * extent); + } + else { + MPI_Aint pos = 0; + ompi_datatype_pack_external("external32", local_buf, count, dtype, shared_buf, count * extent, &pos); + } +} + +/* Solo unpack from shared memory */ +static inline void mca_coll_solo_unpack_from_shared(void *local_buf, void *shared_buf, struct ompi_datatype_t *dtype, int count, ptrdiff_t extent) { + if (ompi_datatype_is_predefined(dtype)) { + memcpy((char *) local_buf, (char *) shared_buf, count * extent); + } + else { + MPI_Aint pos = 0; + ompi_datatype_unpack_external("external32", shared_buf, count * extent, &pos, local_buf, count, dtype); + } +} + +/* Solo copy from source to target */ +static inline void 
mca_coll_solo_copy(void *source, void *target, struct ompi_datatype_t *dtype, int count, ptrdiff_t extent) { + if (ompi_datatype_is_predefined(dtype)) { + memcpy(target, source, count * extent); + } + else { + ompi_datatype_copy_content_same_ddt(dtype, count, target, source); + } + return; +} + END_C_DECLS #endif /* MCA_COLL_SOLO_EXPORT_H */ diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 7637b1cad29..446495afa68 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -18,14 +18,7 @@ int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { - mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); - } - else { - mca_coll_solo_allreduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, comm, module); - } - return OMPI_SUCCESS; - + return mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); } @@ -91,10 +84,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } } else { /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int seg_count = mca_coll_solo_component.mpool_large_block_size / extent; int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { @@ -122,7 +112,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, 
seg_count * extent); + mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); } @@ -147,7 +137,7 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } else { seg_count = count - i * l_seg_count; } - memcpy((char *) c, data_bufs[i], seg_count * extent); + mca_coll_solo_copy((void *) data_bufs[i], (void *) c, dtype, seg_count, extent); c = c + seg_count * extent; } mac_coll_solo_barrier_intra(comm, module); @@ -167,127 +157,4 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } return OMPI_SUCCESS; -} - -int mca_coll_solo_allreduce_ring_intra_osc(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - int size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); - int i; - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } - - char **data_bufs = NULL; - int id; - MPI_Win cur_win; - char *local_buf = NULL; - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - data_bufs = (char **) malloc(sizeof(char *) * size); - for (i = 0; i < size; i++) { - data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; - } - cur_win = solo_module->static_win; - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); - 
local_buf = - mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - l_seg_count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); - cur_win = solo_module->dynamic_win; - } else { - /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); - int num_segments = (count + seg_count - 1) / seg_count; - int last_count = count - seg_count * (num_segments - 1); - for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; - char *temp_rbuf = (char *)rbuf + seg_count * extent * i; - int temp_count = seg_count; - if (i == num_segments - 1) { - temp_count = last_count; - } - mca_coll_solo_allreduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, - comm, module); - } - return MPI_SUCCESS; - } - - *(int *) (solo_module->ctrl_bufs[rank]) = rank; - mac_coll_solo_barrier_intra(comm, module); - - int cur = rank; - for (i = 0; i < size; i++) { - if (cur != size - 1) { - seg_count = l_seg_count; - } else { - seg_count = count - cur * l_seg_count; - } - /* At first iteration, copy local data to the solo data buffer */ - if (cur == rank) { - cur_win->w_osc_module->osc_fence(0, cur_win); - cur_win->w_osc_module->osc_put((char *) sbuf + - cur * l_seg_count * extent, - seg_count, dtype, cur, - (ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - /* For other iterations, do operations on the solo data buffer */ - else { - cur_win->w_osc_module->osc_accumulate((char *) sbuf + - cur * l_seg_count * - extent, seg_count, dtype, cur, (ptrdiff_t) - data_bufs[cur], seg_count, dtype, op, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - cur = (cur - 1 + size) % size; - *(int 
*) (solo_module->ctrl_bufs[rank]) = - (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - mac_coll_solo_barrier_intra(comm, module); - - } - /* At last, all the processes copies data from the solo data buffer */ - char *c; - c = rbuf; - for (i = 0; i < size; i++) { - if (i != size - 1) { - seg_count = l_seg_count; - } else { - seg_count = count - i * l_seg_count; - } - cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, - (ptrdiff_t) data_bufs[i], seg_count, dtype, cur_win); - c = c + seg_count * extent; - } - cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - if (data_bufs != NULL) { - free(data_bufs); - data_bufs = NULL; - } - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); - } - return OMPI_SUCCESS; -} +} \ No newline at end of file diff --git a/ompi/mca/coll/solo/coll_solo_bcast.c b/ompi/mca/coll/solo/coll_solo_bcast.c index 88d1cc12ee4..55224d846bf 100644 --- a/ompi/mca/coll/solo/coll_solo_bcast.c +++ b/ompi/mca/coll/solo/coll_solo_bcast.c @@ -17,13 +17,7 @@ int mca_coll_solo_bcast_intra(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { - mca_coll_solo_bcast_linear_intra_memcpy(buff, count, dtype, root, comm, module); - } - else { - mca_coll_solo_bcast_linear_intra_osc(buff, count, dtype, root, comm, module); - } - return OMPI_SUCCESS; + return mca_coll_solo_bcast_linear_intra_memcpy(buff, count, dtype, root, comm, module); } /* linear bcast with memcpy */ @@ -61,12 +55,12 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, /* Root copy data to the shared memory block */ if (rank == root) { - memcpy(data_buf, (char *) buff, count * extent); + 
mca_coll_solo_pack_to_shared(buff, (void *) data_buf, dtype, count, extent); } mac_coll_solo_barrier_intra(comm, module); /* Other processes copy data from the shared memory block */ if (rank != root) { - memcpy((char *) buff, data_buf, count * extent); + mca_coll_solo_unpack_from_shared(buff, (void *) data_buf, dtype, count, extent); } mac_coll_solo_barrier_intra(comm, module); if ((size_t) count * extent > mca_coll_solo_component.static_block_size && @@ -78,72 +72,6 @@ int mca_coll_solo_bcast_linear_intra_memcpy(void *buff, int count, return OMPI_SUCCESS; } -int mca_coll_solo_bcast_linear_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - - int rank = ompi_comm_rank(comm); - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - /* Init the data_buf - shared among all the processes */ - int id = 0; - char **attached_bufs = NULL; - MPI_Win cur_win; - char *data_buf; - if ((size_t) count * extent <= mca_coll_solo_component.static_block_size) { - data_buf = (char *) 0 + 4 * opal_cache_line_size; - cur_win = solo_module->static_win; - } else if ((size_t) count * extent <= mca_coll_solo_component.mpool_small_block_size) { - if (rank == root) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, count * extent); - data_buf = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - count * extent); - attached_bufs = mca_coll_solo_attach_buf(solo_module, comm, data_buf, count * extent); - } else { - attached_bufs = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); - } - data_buf = attached_bufs[root]; - cur_win = solo_module->dynamic_win; - } else { - return mca_coll_solo_bcast_pipeline_intra_osc(buff, 
count, dtype, root, comm, module, - mca_coll_solo_component.mpool_small_block_size); - } - - /* Root copy to shared memory */ - cur_win->w_osc_module->osc_fence(0, cur_win); - if (rank == root) { - cur_win->w_osc_module->osc_put(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, - cur_win); - } - cur_win->w_osc_module->osc_fence(0, cur_win); - /* Other processes copy data from shared memory */ - if (rank != root) { - cur_win->w_osc_module->osc_get(buff, count, dtype, root, (ptrdiff_t) data_buf, count, dtype, - cur_win); - } - cur_win->w_osc_module->osc_fence(0, cur_win); - - if ((size_t) count * extent > mca_coll_solo_component.static_block_size && - (size_t) count * extent <= mca_coll_solo_component.mpool_large_block_size) { - if (rank == root) { - mca_coll_solo_detach_buf(solo_module, comm, data_buf, &attached_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, count * extent); - } else { - mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs); - } - } - - return OMPI_SUCCESS; -} - int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, struct ompi_datatype_t *dtype, int root, @@ -175,10 +103,7 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, seg_size); } - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); + int seg_count = seg_size / extent; int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); @@ -188,12 +113,12 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, if (i == 0) { /* In the first iteration, root copies data to the current shared memory block */ if (rank == root) { - memcpy(data_bufs[cur], (char *) buff, seg_count * extent); + mca_coll_solo_pack_to_shared(buff, (void *) data_bufs[cur], dtype, seg_count, extent); } } else if ( i == num_segments) { /* In the last iteration, other processes copy 
data from the previous shared memory block */ - memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], last_count * extent); + mca_coll_solo_unpack_from_shared(((char *) buff) + seg_count * extent * (i - 1), (void *) data_bufs[pre], dtype, last_count, extent); } else { /** @@ -205,10 +130,10 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, if ( i == num_segments - 1) { temp_count = last_count; } - memcpy(data_bufs[cur], ((char *) buff) + seg_count * extent * i, temp_count * extent); + mca_coll_solo_pack_to_shared(((char *) buff) + seg_count * extent * i, data_bufs[cur], dtype, temp_count, extent); } else { - memcpy(((char *) buff) + seg_count * extent * (i - 1), data_bufs[pre], seg_count * extent); + mca_coll_solo_unpack_from_shared(((char *) buff) + seg_count * extent * (i - 1), (void *) data_bufs[pre], dtype, seg_count, extent); } } mac_coll_solo_barrier_intra(comm, module); @@ -223,99 +148,3 @@ int mca_coll_solo_bcast_pipeline_intra_memcpy(void *buff, int count, return OMPI_SUCCESS; } - -int mca_coll_solo_bcast_pipeline_intra_osc(void *buff, int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module, - size_t seg_size) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - - int rank = ompi_comm_rank(comm); - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - /* Init the data_bufs - shared among all the processes, needs two for the pipelining */ - int ids[2]; - char **attached_bufs[2]; - MPI_Win cur_win = solo_module->dynamic_win; - char *data_bufs[2]; - int i; - for (i = 0; i < 2; i++) { - if (rank == root) { - ids[i] = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, seg_size); - data_bufs[i] = mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, 
ids[i], - seg_size); - attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, data_bufs[i], seg_size); - } - else { - attached_bufs[i] = mca_coll_solo_attach_buf(solo_module, comm, NULL, 0); - } - data_bufs[i] = attached_bufs[i][root]; - } - - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(seg_size, typelng, seg_count); - int num_segments = (count + seg_count - 1) / seg_count; - int last_count = count - seg_count * (num_segments - 1); - - cur_win->w_osc_module->osc_fence(0, cur_win); - for (i = 0; i <= num_segments; i++) { - int cur = i & 1; - int pre = !cur; - if (i == 0) { - /* In the first iteration, root copies data to the current shared memory block */ - if (rank == root) { - cur_win->w_osc_module->osc_put(buff, seg_count, dtype, root, (ptrdiff_t) data_bufs[cur], - seg_count, dtype, cur_win); - } - } - else if ( i == num_segments) { - /* In the last iteration, other processes copy data from the previous shared memory block */ - cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), - last_count, dtype, root, (ptrdiff_t) data_bufs[pre], - last_count, dtype, cur_win); - } - else { - /** - * For other iterations, root copies data to the current shared memory block and - * other proceeses copy data from the previous shared memory block. 
- */ - if (rank == root) { - int temp_count = seg_count; - if ( i == num_segments - 1) { - temp_count = last_count; - } - cur_win->w_osc_module->osc_put(((char *) buff) + seg_count * extent * i, - temp_count, dtype, root, (ptrdiff_t) data_bufs[cur], - temp_count, dtype, cur_win); - } - else { - cur_win->w_osc_module->osc_get(((char *) buff) + seg_count * extent * (i - 1), - seg_count, dtype, root, (ptrdiff_t) data_bufs[pre], - seg_count, dtype, cur_win); - } - } - cur_win->w_osc_module->osc_fence(0, cur_win); - } - - /* Return the data_bufs */ - for (i = 0; i < 2; i++) { - if (rank == root) { - mca_coll_solo_detach_buf(solo_module, comm, data_bufs[i], &attached_bufs[i]); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, ids[i], seg_size); - } - else { - mca_coll_solo_detach_buf(solo_module, comm, NULL, &attached_bufs[i]); - } - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/solo/coll_solo_module.c b/ompi/mca/coll/solo/coll_solo_module.c index 0b59625a0c1..4b07ed4132f 100644 --- a/ompi/mca/coll/solo/coll_solo_module.c +++ b/ompi/mca/coll/solo/coll_solo_module.c @@ -53,7 +53,6 @@ static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, static void mca_coll_solo_module_construct(mca_coll_solo_module_t * module) { module->enabled = false; - module->dynamic_win = NULL; module->static_win = NULL; module->ctrl_bufs = NULL; module->data_bufs = NULL; @@ -82,9 +81,6 @@ static int mca_coll_solo_module_disable(mca_coll_base_module_t * module, // int rank = ompi_comm_rank(comm); // /* Free the windows */ - // if (m->dynamic_win != NULL) { - // ompi_win_free(m->dynamic_win); - // } // if (m->static_win != NULL) { // ompi_win_free(m->static_win); // } @@ -225,10 +221,6 @@ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_commu mca_coll_solo_component.solo_mpool = OBJ_NEW(mca_coll_solo_mpool_t); } - /* Create the dynamic_win */ - ompi_win_create_dynamic((opal_info_t *) (&ompi_mpi_info_null), comm, - 
&solo_module->dynamic_win); - /* Create the static_win with shared memory allocation */ mca_coll_solo_setup_static_win(solo_module, comm, mca_coll_solo_component.static_block_size); @@ -241,46 +233,6 @@ int mca_coll_solo_lazy_enable(mca_coll_base_module_t * module, struct ompi_commu return OMPI_SUCCESS; } -/** - * Attach a memory block to the dynamic_win of a communicator, returns an array contains the - * addresses of all the blocks of the processes in the communicator. - * local_buf == NULL and local_buf_size == 0 means there is no block to be attached on this process. - */ -char **mca_coll_solo_attach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, size_t local_buf_size) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); - - char **attached_bufs = (char **) malloc(sizeof(char *) * size); - attached_bufs[rank] = local_buf; - ompi_coll_base_allgather_intra_recursivedoubling(MPI_IN_PLACE, 0, - MPI_DATATYPE_NULL, - attached_bufs, - 1, MPI_AINT, comm, - (mca_coll_base_module_t *) solo_module); - - solo_module->dynamic_win->w_osc_module->osc_win_attach(solo_module->dynamic_win, local_buf, - local_buf_size); - - return attached_bufs; -} - -/* Detach a memory block from the dynamic_win of a communicator */ -void mca_coll_solo_detach_buf(mca_coll_solo_module_t * solo_module, - struct ompi_communicator_t *comm, - char *local_buf, char ***attached_bufs) -{ - if (local_buf != NULL) { - solo_module->dynamic_win->w_osc_module->osc_win_detach(solo_module->dynamic_win, local_buf); - } - - free(*attached_bufs); - *attached_bufs = NULL; - return; -} - /* Setup and initialize the static_win of a communicator */ void mca_coll_solo_setup_static_win(mca_coll_solo_module_t * solo_module, struct ompi_communicator_t *comm, size_t data_buf_size) diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 5ef92c960ba..0d4cbc4b612 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c 
+++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -18,15 +18,7 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - if (ompi_datatype_is_contiguous_memory_layout(dtype, count)) { - mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); - } - else { - mca_coll_solo_reduce_ring_intra_osc(sbuf, rbuf, count, dtype, op, root, comm, module); - - } - return OMPI_SUCCESS; - + return mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); } /** @@ -93,10 +85,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } } else { /* For the messages which are greater than mpool_large_block_size*np, invoke this reduce multiple times */ - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); + int seg_count = mca_coll_solo_component.mpool_large_block_size / extent; int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { @@ -122,13 +111,12 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } else { seg_count = count - cur * l_seg_count; } - /* At first iteration, copy local data to the solo data buffer */ + /* At first iteration, copy local data to the shared data buffer */ if (cur == rank) { - memcpy(data_bufs[cur], (char *) sbuf + cur * l_seg_count * extent, seg_count * extent); + mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); - } - /* For other iterations, do operations on the solo data buffer */ + /* For other iterations, do operations on the shared data buffer */ else { ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, 
data_bufs[cur], seg_count, dtype); @@ -150,7 +138,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } else { seg_count = count - i * l_seg_count; } - memcpy((char *) c, data_bufs[i], seg_count * extent); + mca_coll_solo_copy((void *) data_bufs[i], (void *) c, dtype, seg_count, extent); c = c + seg_count * extent; } } @@ -172,129 +160,3 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, return OMPI_SUCCESS; } - -int mca_coll_solo_reduce_ring_intra_osc(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t * module) -{ - mca_coll_solo_module_t *solo_module = (mca_coll_solo_module_t *) module; - int size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); - int i; - ptrdiff_t extent, lower_bound; - ompi_datatype_get_extent(dtype, &lower_bound, &extent); - - /* Enable solo module if necessary */ - if (!solo_module->enabled) { - mca_coll_solo_lazy_enable(module, comm); - } - - /* Set up segment count */ - int seg_count, l_seg_count; - seg_count = count / size; - l_seg_count = seg_count; - if (rank == size - 1) { - seg_count = count - rank * l_seg_count; - } - - char **data_bufs = NULL; - int id; - MPI_Win cur_win; - char *local_buf = NULL; - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - data_bufs = (char **) malloc(sizeof(char *) * size); - for (i = 0; i < size; i++) { - data_bufs[i] = (char *) 0 + 4 * opal_cache_line_size; - } - cur_win = solo_module->static_win; - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - id = mca_coll_solo_mpool_request(mca_coll_solo_component.solo_mpool, l_seg_count * extent); - local_buf = - mca_coll_solo_mpool_calculate(mca_coll_solo_component.solo_mpool, id, - l_seg_count * extent); - data_bufs = mca_coll_solo_attach_buf(solo_module, comm, local_buf, l_seg_count * extent); - cur_win 
= solo_module->dynamic_win; - } else { - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_solo_component.mpool_large_block_size, typelng, seg_count); - int num_segments = (count + seg_count - 1) / seg_count; - int last_count = count - seg_count * (num_segments - 1); - for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; - char *temp_rbuf = (char *)rbuf + seg_count * extent * i; - int temp_count = seg_count; - if (i == num_segments - 1) { - temp_count = last_count; - } - mca_coll_solo_reduce_ring_intra_osc(temp_sbuf, temp_rbuf, temp_count, dtype, op, - root, comm, module); - } - return MPI_SUCCESS; - } - - - *(int *) (solo_module->ctrl_bufs[rank]) = rank; - mac_coll_solo_barrier_intra(comm, module); - - int cur = rank; - for (i = 0; i < size; i++) { - if (cur != size - 1) { - seg_count = l_seg_count; - } else { - seg_count = count - cur * l_seg_count; - } - /* At first iteration, copy local data to the solo data buffer */ - if (cur == rank) { - cur_win->w_osc_module->osc_fence(0, cur_win); - cur_win->w_osc_module->osc_put((char *) sbuf + - cur * l_seg_count * extent, - seg_count, dtype, cur, - (ptrdiff_t) data_bufs[cur], seg_count, dtype, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - /* For other iterations, do operations on the solo data buffer */ - else { - cur_win->w_osc_module->osc_accumulate((char *) sbuf + - cur * l_seg_count * - extent, seg_count, dtype, cur, (ptrdiff_t) - data_bufs[cur], seg_count, dtype, op, cur_win); - cur_win->w_osc_module->osc_fence(0, cur_win); - } - cur = (cur - 1 + size) % size; - *(int *) (solo_module->ctrl_bufs[rank]) = - (*(int *) (solo_module->ctrl_bufs[rank]) + 1) % size; - mac_coll_solo_barrier_intra(comm, module); - - } - /* At last, root copies data from the solo data buffer */ - if (rank == root) { - char *c; - c = rbuf; - for (i = 0; i < size; i++) { - if (i != size - 1) { - seg_count = 
l_seg_count; - } else { - seg_count = count - i * l_seg_count; - } - cur_win->w_osc_module->osc_get(c, seg_count, dtype, i, (ptrdiff_t) - data_bufs[i], seg_count, dtype, cur_win); - c = c + seg_count * extent; - } - } - cur_win->w_osc_module->osc_fence(0, cur_win); - if ((size_t) l_seg_count * extent <= mca_coll_solo_component.static_block_size) { - if (data_bufs != NULL) { - free(data_bufs); - data_bufs = NULL; - } - } else if ((size_t) l_seg_count * extent <= mca_coll_solo_component.mpool_large_block_size) { - mca_coll_solo_detach_buf(solo_module, comm, local_buf, &data_bufs); - mca_coll_solo_mpool_return(mca_coll_solo_component.solo_mpool, id, l_seg_count * extent); - } - return OMPI_SUCCESS; -} From 8e4e553c954a3ef9a436e2d36a97cd854b9cb790 Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Tue, 24 Dec 2019 12:07:32 -0500 Subject: [PATCH 7/7] Add MPI_IN_PLACE support in SOLO module --- ompi/mca/coll/solo/coll_solo.h | 8 ++++---- ompi/mca/coll/solo/coll_solo_allreduce.c | 22 ++++++++++++++++++---- ompi/mca/coll/solo/coll_solo_reduce.c | 24 ++++++++++++++++++++---- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/ompi/mca/coll/solo/coll_solo.h b/ompi/mca/coll/solo/coll_solo.h index 0e378275ba4..be32d159ef8 100644 --- a/ompi/mca/coll/solo/coll_solo.h +++ b/ompi/mca/coll/solo/coll_solo.h @@ -160,8 +160,8 @@ static inline void mca_coll_solo_pack_to_shared(void *local_buf, void *shared_bu memcpy((char *) shared_buf, (char *) local_buf, count * extent); } else { - MPI_Aint pos = 0; - ompi_datatype_pack_external("external32", local_buf, count, dtype, shared_buf, count * extent, &pos); + int pos = 0; + MPI_Pack(local_buf, count, dtype, shared_buf, count * extent, &pos, MPI_COMM_SELF); } } @@ -171,8 +171,8 @@ static inline void mca_coll_solo_unpack_from_shared(void *local_buf, void *share memcpy((char *) local_buf, (char *) shared_buf, count * extent); } else { - MPI_Aint pos = 0; - ompi_datatype_unpack_external("external32", shared_buf, count * extent, &pos, 
local_buf, count, dtype); + int pos = 0; + MPI_Unpack(shared_buf, count * extent, &pos, local_buf, count, dtype, MPI_COMM_SELF); } } diff --git a/ompi/mca/coll/solo/coll_solo_allreduce.c b/ompi/mca/coll/solo/coll_solo_allreduce.c index 446495afa68..e850ff92c0a 100644 --- a/ompi/mca/coll/solo/coll_solo_allreduce.c +++ b/ompi/mca/coll/solo/coll_solo_allreduce.c @@ -18,7 +18,12 @@ int mca_coll_solo_allreduce_intra(const void *sbuf, void *rbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - return mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); + if (ompi_op_is_commute(op)) { + return mca_coll_solo_allreduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, comm, module); + } + else { + return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module); + } } @@ -88,7 +93,11 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_sbuf; + if (sbuf == MPI_IN_PLACE) + temp_sbuf = MPI_IN_PLACE; + else + temp_sbuf = (char *)sbuf + seg_count * extent * i; char *temp_rbuf = (char *)rbuf + seg_count * extent * i; int temp_count = seg_count; if (i == num_segments - 1) { @@ -100,6 +109,11 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int return MPI_SUCCESS; } + char *sbuf_temp = (char *)sbuf; + if( sbuf == MPI_IN_PLACE ) { + sbuf_temp = (char *)rbuf; + } + *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -112,13 +126,13 @@ int mca_coll_solo_allreduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int } /* At first iteration, copy local data to the solo data buffer */ if (cur == rank) { - mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), 
(void *) data_bufs[cur], dtype, seg_count, extent); + mca_coll_solo_copy((void *) ((char *) sbuf_temp + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); } /* For other iterations, do operations on the solo data buffer */ else { - ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + ompi_op_reduce(op, (char *) sbuf_temp + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); mac_coll_solo_barrier_intra(comm, module); } diff --git a/ompi/mca/coll/solo/coll_solo_reduce.c b/ompi/mca/coll/solo/coll_solo_reduce.c index 0d4cbc4b612..8b498ccc98e 100644 --- a/ompi/mca/coll/solo/coll_solo_reduce.c +++ b/ompi/mca/coll/solo/coll_solo_reduce.c @@ -18,7 +18,12 @@ int mca_coll_solo_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - return mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); + if (ompi_op_is_commute(op)) { + return mca_coll_solo_reduce_ring_intra_memcpy(sbuf, rbuf, count, dtype, op, root, comm, module); + } + else { + return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, root, comm, module); + } } /** @@ -46,6 +51,7 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); int i; + ptrdiff_t extent, lower_bound; ompi_datatype_get_extent(dtype, &lower_bound, &extent); @@ -89,7 +95,11 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, int num_segments = (count + seg_count - 1) / seg_count; int last_count = count - seg_count * (num_segments - 1); for (int i = 0; i < num_segments; i++) { - char *temp_sbuf = (char *)sbuf + seg_count * extent * i; + char *temp_sbuf; + if (sbuf == MPI_IN_PLACE) + temp_sbuf = MPI_IN_PLACE; + else + temp_sbuf = (char *)sbuf + seg_count * extent * i; char *temp_rbuf = (char *)rbuf + seg_count * extent * i; 
int temp_count = seg_count; if (i == num_segments - 1) { @@ -101,6 +111,11 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, return MPI_SUCCESS; } + char *sbuf_temp = (char *)sbuf; + if( sbuf == MPI_IN_PLACE ) { + sbuf_temp = (char *)rbuf; + } + *(int *) (solo_module->ctrl_bufs[rank]) = rank; mac_coll_solo_barrier_intra(comm, module); @@ -113,12 +128,13 @@ int mca_coll_solo_reduce_ring_intra_memcpy(const void *sbuf, void *rbuf, } /* At first iteration, copy local data to the shared data buffer */ if (cur == rank) { - mca_coll_solo_copy((void *) ((char *) sbuf + cur * l_seg_count * extent), (void *) data_bufs[cur], dtype, seg_count, extent); + mca_coll_solo_copy((void *) ((char *) sbuf_temp + cur * l_seg_count * extent), + (void *) data_bufs[cur], dtype, seg_count, extent); mac_coll_solo_barrier_intra(comm, module); } /* For other iterations, do operations on the shared data buffer */ else { - ompi_op_reduce(op, (char *) sbuf + cur * l_seg_count * extent, + ompi_op_reduce(op, (char *) sbuf_temp + cur * l_seg_count * extent, data_bufs[cur], seg_count, dtype); mac_coll_solo_barrier_intra(comm, module); }