From 483c3f91a0fd512a16a55a15ada5d15701377f5c Mon Sep 17 00:00:00 2001 From: David Eberius Date: Thu, 23 Aug 2018 17:16:20 -0400 Subject: [PATCH 1/3] This is an update to the SPC code to add two major features: bin counters and the mmap interface. Bin counters are designed to allow for more context in the counters so that one counter can have several sub-counters attached to it as bins with breakpoints to denote when each bin gets incremented. These are used extensively for collective algorithm counters which count the number of times each collective algorithm is called and under which circumstances small/large communicator/ message. The mmap interface creates a shared file using mmap so tools can attach to this and read SPC values directly using an XML file to point out where the data for each counter resides. This update also includes some bug fixes and quality of life improvements to the code. Signed-off-by: David Eberius --- examples/spc_example.c | 37 +- ompi/mca/coll/base/coll_base_allgather.c | 13 + ompi/mca/coll/base/coll_base_allreduce.c | 11 + ompi/mca/coll/base/coll_base_alltoall.c | 13 + ompi/mca/coll/base/coll_base_barrier.c | 13 + ompi/mca/coll/base/coll_base_bcast.c | 16 + ompi/mca/coll/base/coll_base_gather.c | 7 + ompi/mca/coll/base/coll_base_reduce.c | 13 + ompi/mca/coll/base/coll_base_reduce_scatter.c | 7 + ompi/mca/coll/base/coll_base_scatter.c | 4 + ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 17 +- ompi/mca/pml/ob1/pml_ob1_recvfrag.h | 3 + ompi/mca/pml/ob1/pml_ob1_sendreq.c | 5 +- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 4 + ompi/runtime/Makefile.am | 1 + ompi/runtime/help-mpi-runtime.txt | 24 + ompi/runtime/ompi_mpi_params.c | 48 +- ompi/runtime/ompi_spc.c | 790 ++++++++++++++++-- ompi/runtime/ompi_spc.h | 115 ++- ompi/runtime/params.h | 39 +- 20 files changed, 1085 insertions(+), 95 deletions(-) diff --git a/examples/spc_example.c b/examples/spc_example.c index 11732cd9f40..b7b09c35690 100644 --- a/examples/spc_example.c +++ b/examples/spc_example.c @@ 
-9,6 +9,7 @@ #include #include #include +#include #include "mpi.h" @@ -47,15 +48,16 @@ int main(int argc, char **argv) message_size = atoi(argv[2]); } - int i, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index; + int i, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index, xml_index; MPI_Datatype datatype; MPI_T_enum enumtype; MPI_Comm comm; char name[256], description[256]; /* Counter names to be read by ranks 0 and 1 */ - char *counter_names[] = {"runtime_spc_OMPI_BYTES_SENT_USER", - "runtime_spc_OMPI_BYTES_RECEIVED_USER" }; + char *counter_names[] = {"runtime_spc_OMPI_SPC_BYTES_SENT_USER", + "runtime_spc_OMPI_SPC_BYTES_RECEIVED_USER" }; + char *xml_counter = "runtime_spc_OMPI_SPC_XML_FILE"; MPI_Init(NULL, NULL); MPI_T_init_thread(MPI_THREAD_SINGLE, &provided); @@ -68,7 +70,7 @@ int main(int argc, char **argv) } /* Determine the MPI_T pvar indices for the OMPI_BYTES_SENT/RECIEVED_USER SPCs */ - index = -1; + index = xml_index = -1; MPI_T_pvar_get_num(&num); for(i = 0; i < num; i++) { name_len = desc_len = 256; @@ -77,20 +79,27 @@ int main(int argc, char **argv) &readonly, &continuous, &atomic); if( MPI_SUCCESS != rc ) continue; + if(strcmp(name, counter_names[rank]) == 0) { index = i; printf("[%d] %s -> %s\n", rank, name, description); } + if(strcmp(name, xml_counter) == 0) { + xml_index = i; + printf("[%d] %s -> %s (index -> %d)\n", rank, name, description, xml_index); + } } /* Make sure we found the counters */ - if(index == -1) { + if(index == -1 || xml_index == -1) { fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n"); MPI_Abort(MPI_COMM_WORLD, -1); } - int ret; + int ret, xml_count; long long value; + char *xml_filename = (char*)malloc(64 * sizeof(char)); + sprintf(xml_filename, "this_is_a_test"); MPI_T_pvar_session session; MPI_T_pvar_handle handle; @@ -99,13 +108,29 @@ int main(int argc, 
char **argv) ret = MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count); ret = MPI_T_pvar_start(session, handle); + MPI_T_pvar_session xml_session; + MPI_T_pvar_handle xml_handle; + if(xml_index >= 0) { + ret = MPI_T_pvar_session_create(&xml_session); + ret = MPI_T_pvar_handle_alloc(xml_session, xml_index, NULL, &xml_handle, &xml_count); + printf("xml_count: %d\n", xml_count); + ret = MPI_T_pvar_start(xml_session, xml_handle); + } + message_exchange(num_messages, message_size); ret = MPI_T_pvar_read(session, handle, &value); + if(xml_index >= 0) { + ret = MPI_T_pvar_read(xml_session, xml_handle, &xml_filename); + } + /* Print the counter values in order by rank */ for(i = 0; i < 2; i++) { if(i == rank) { printf("[%d] Value Read: %lld\n", rank, value); + if(xml_index >= 0) { + printf("[%d] Value Read: %s\n", rank, xml_filename); + } fflush(stdout); } MPI_Barrier(MPI_COMM_WORLD); diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index f3d3fd1d0a7..c03a64c039a 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -31,6 +31,7 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -99,6 +100,8 @@ int ompi_coll_base_allgather_intra_bruck(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_bruck rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_BRUCK, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -286,6 +289,8 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, int scount, "coll:base:allgather_intra_recursivedoubling rank %d, size %d", rank, size)); + 
SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -372,6 +377,8 @@ int ompi_coll_base_allgather_intra_ring(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_ring rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_RING, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -508,6 +515,8 @@ ompi_coll_base_allgather_intra_neighborexchange(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_neighborexchange rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -611,6 +620,8 @@ int ompi_coll_base_allgather_intra_two_procs(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_allgather_intra_two_procs rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_TWO_PROCS, scount * sdtype->super.size, 2); + if (2 != ompi_comm_size(comm)) { return MPI_ERR_UNSUPPORTED_OPERATION; } @@ -689,6 +700,8 @@ ompi_coll_base_allgather_intra_basic_linear(const void *sbuf, int scount, int err; ptrdiff_t lb, extent; + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_LINEAR, scount * sdtype->super.size, ompi_comm_size(comm)); + /* Handle MPI_IN_PLACE (see explanantion in reduce.c for how to allocate temp buffer) -- note that rank 0 can use IN_PLACE natively, and we can just alias the right position in rbuf diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 828b32061a9..1504a2ae9ed 100644 --- 
a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -36,6 +36,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -63,6 +64,8 @@ ompi_coll_base_allreduce_intra_nonoverlapping(const void *sbuf, void *rbuf, int OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_nonoverlapping rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING, count * dtype->super.size, ompi_comm_size(comm)); + /* Reduce to 0 and broadcast. */ if (MPI_IN_PLACE == sbuf) { @@ -145,6 +148,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING, count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -358,6 +363,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_ring rank %d, count %d", rank, count)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_RING, count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -637,6 +644,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_ring_segmented rank %d, count %d", rank, count)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED, count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -890,6 +899,8 @@ ompi_coll_base_allreduce_intra_basic_linear(const void *sbuf, void *rbuf, int co 
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_basic_linear rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_LINEAR, count * dtype->super.size, ompi_comm_size(comm)); + /* Reduce to 0 and broadcast. */ if (MPI_IN_PLACE == sbuf) { diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index a61bf40ca97..9d18e1ecc16 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -32,6 +32,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -53,6 +54,8 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_INPLACE, rcount * rdtype->super.size, size); + /* If only one process, we're done. 
*/ if (1 == size) { return MPI_SUCCESS; @@ -151,6 +154,8 @@ int ompi_coll_base_alltoall_intra_pairwise(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_pairwise rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_PAIRWISE, rcount * rdtype->super.size, size); + err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &lb, &rext); @@ -212,6 +217,8 @@ int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_bruck rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_BRUCK, rcount * rdtype->super.size, size); + err = ompi_datatype_type_extent (sdtype, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -358,6 +365,8 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_linear_sync rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC, rcount * rdtype->super.size, size); + error = ompi_datatype_get_extent(sdtype, &slb, &sext); if (OMPI_SUCCESS != error) { return error; @@ -512,6 +521,8 @@ int ompi_coll_base_alltoall_intra_two_procs(const void *sbuf, int scount, return MPI_ERR_UNSUPPORTED_OPERATION; } + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_TWO_PROCS, rcount * rdtype->super.size, 2); + err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -594,6 +605,8 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_LINEAR, rcount * rdtype->super.size, size); + err = ompi_datatype_get_extent(sdtype, &lb, 
&sndinc); if (OMPI_SUCCESS != err) { return err; diff --git a/ompi/mca/coll/base/coll_base_barrier.c b/ompi/mca/coll/base/coll_base_barrier.c index 49ac4ea2e9e..7707f126056 100644 --- a/ompi/mca/coll/base/coll_base_barrier.c +++ b/ompi/mca/coll/base/coll_base_barrier.c @@ -33,6 +33,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -109,6 +110,8 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_DOUBLE_RING, 1); + left = ((rank-1)%size); right = ((rank+1)%size); @@ -182,6 +185,8 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING, 1); + /* do nearest power of 2 less than size calc */ adjsize = opal_next_poweroftwo(size); adjsize >>= 1; @@ -262,6 +267,8 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_bruck rank %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_BRUCK, 1); + /* exchange data with rank-2^k and rank+2^k */ for (distance = 1; distance < size; distance <<= 1) { from = (rank + size - distance) % size; @@ -304,6 +311,8 @@ int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_two_procs rank %d", remote)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_TWO_PROCS, 1); + remote = (remote + 1) & 0x1; err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, @@ -338,6 +347,8 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t 
*comm, return MPI_SUCCESS; rank = ompi_comm_rank(comm); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_LINEAR, 1); + /* All non-root send & receive zero-length message. */ if (rank > 0) { err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, @@ -414,6 +425,8 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm, "ompi_coll_base_barrier_intra_tree %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_TREE, 1); + /* Find the nearest power of 2 of the communicator size. */ depth = opal_next_poweroftwo_inclusive(size); diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 7af75353d2d..8b55c7e9cd2 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -31,6 +31,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -265,6 +266,8 @@ ompi_coll_base_bcast_intra_bintree ( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_BINTREE, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_bintree ); } @@ -293,6 +296,8 @@ ompi_coll_base_bcast_intra_pipeline( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_PIPELINE, count*typelng, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_pipeline ); } @@ -321,6 +326,8 @@ ompi_coll_base_bcast_intra_chain( void* 
buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_CHAIN, count*typelng, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_chain ); } @@ -349,6 +356,8 @@ ompi_coll_base_bcast_intra_binomial( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_BINOMIAL, count*typelng, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_bmtree ); } @@ -388,6 +397,8 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, err = ompi_datatype_type_size( datatype, &type_size ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_SPLIT_BINTREE, count*type_size, ompi_comm_size(comm)); + /* Determine number of segments and number of elements per segment */ counts[0] = count/2; if (count % 2 != 0) counts[0]++; @@ -685,6 +696,11 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count, ompi_coll_base_free_reqs(reqs, i); } + size_t typelng; + ompi_datatype_type_size( datatype, &typelng ); + + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_LINEAR, count*typelng, size); + /* All done */ return err; } diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index 6fd1e981461..a7f09f0c2f6 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -32,6 +32,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include 
"coll_base_util.h" @@ -61,6 +62,8 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_GATHER_BINOMIAL, scount * sdtype->super.size, size); + /* create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; @@ -225,6 +228,8 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_GATHER_LINEAR_SYNC, scount * sdtype->super.size, size); + if (rank != root) { /* Non-root processes: - receive zero byte message from the root, @@ -384,6 +389,8 @@ ompi_coll_base_gather_intra_basic_linear(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_basic_linear rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_GATHER_LINEAR, scount * sdtype->super.size, size); + if (rank != root) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHER, diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index dfd709bfb90..dcdfaaa9221 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -36,6 +36,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -399,6 +400,8 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, int c ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_CHAIN, count * datatype->super.size, ompi_comm_size(comm)); + 
return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_chain, @@ -431,6 +434,8 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf, ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_PIPELINE, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_pipeline, @@ -462,6 +467,8 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf, ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_BINARY, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_bintree, @@ -493,6 +500,8 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf, ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_BINOMIAL, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_in_order_bmtree, @@ -530,6 +539,8 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE, count * datatype->super.size, size); + /** * Determine number of segments and number of elements * sent per operation @@ -643,6 +654,8 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_LINEAR, count * dtype->super.size, size); + /* 
If not root, send data to the root. */ if (rank != root) { diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 984a91787a0..92c6d997a70 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -35,6 +35,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -62,6 +63,8 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; } + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING, total_count * dtype->super.size, size); + /* Reduce to rank 0 (root) and scatterv */ tmprbuf = (char*) rbuf; if (MPI_IN_PLACE == sbuf) { @@ -159,6 +162,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, } count = disps[size - 1] + rcounts[size - 1]; + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING, count * dtype->super.size, size); + /* short cut the trivial case */ if (0 == count) { free(disps); @@ -487,6 +492,8 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in if (max_block_count < rcounts[i]) max_block_count = rcounts[i]; } + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_SCATTER_RING, total_count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index 0ca35971532..90df9ab3116 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -32,6 +32,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include 
"coll_base_util.h" @@ -78,6 +79,7 @@ ompi_coll_base_scatter_intra_binomial( OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:scatter_intra_binomial rank %d/%d", rank, size)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_SCATTER_BINOMIAL, scount * sdtype->super.size, size); /* Create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE(comm, base_module, root); @@ -232,6 +234,8 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount, rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_SCATTER_LINEAR, scount * sdtype->super.size, size); + /* If not root, receive data. */ if (rank != root) { diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 0457d6feb0b..af8eeab8fdb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -143,6 +143,7 @@ append_frag_to_ordered_list(mca_pml_ob1_recv_frag_t** queue, d1 = d2; prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_prev); d2 = prior->hdr.hdr_match.hdr_seq - hdr->hdr_seq; + SPC_RECORD(OMPI_SPC_OOS_QUEUE_HOPS, 1); } while( (hdr->hdr_seq < prior->hdr.hdr_match.hdr_seq) && (d1 > d2) && (prior != *queue) ); } else { @@ -150,6 +151,7 @@ append_frag_to_ordered_list(mca_pml_ob1_recv_frag_t** queue, next_seq = ((mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next))->hdr.hdr_match.hdr_seq; /* prevent rollover */ while( (hdr->hdr_seq > prior_seq) && (hdr->hdr_seq > next_seq) && (prior_seq < next_seq) ) { + SPC_RECORD(OMPI_SPC_OOS_QUEUE_HOPS, 1); prior_seq = next_seq; prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next); next_seq = ((mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next))->hdr.hdr_match.hdr_seq; @@ -334,6 +336,7 @@ check_cantmatch_for_match(mca_pml_ob1_comm_proc_t *proc) mca_pml_ob1_recv_frag_t *frag = proc->frags_cant_match; if( (NULL != frag) && (frag->hdr.hdr_match.hdr_seq == proc->expected_sequence) ) { + 
SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, -1); return remove_head_from_ordered_list(&proc->frags_cant_match); } return NULL; @@ -408,6 +411,8 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl); append_frag_to_ordered_list(&proc->frags_cant_match, frag, proc->expected_sequence); SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1); + SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE); OB1_MATCHING_UNLOCK(&comm->matching_lock); return; } @@ -802,7 +807,8 @@ match_one(mca_btl_base_module_t *btl, mca_pml_ob1_recv_frag_t* frag) { #if SPC_ENABLE == 1 - opal_timer_t timer = 0; + opal_timer_t timer; + timer = 0; #endif SPC_TIMER_START(OMPI_SPC_MATCH_TIME, &timer); @@ -856,6 +862,12 @@ match_one(mca_btl_base_module_t *btl, SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer); return match; } + SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer); + +#if SPC_ENABLE == 1 + opal_timer_t queue_timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_MATCH_QUEUE_TIME, &queue_timer); /* if no match found, place on unexpected queue */ #if MCA_PML_OB1_CUSTOM_MATCH @@ -865,12 +877,13 @@ match_one(mca_btl_base_module_t *btl, append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments, num_segments, frag); #endif + SPC_TIMER_STOP(OMPI_SPC_MATCH_QUEUE_TIME, &queue_timer); SPC_RECORD(OMPI_SPC_UNEXPECTED, 1); SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, 1); SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE); + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); - SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer); return NULL; } while(true); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h index def120ccc62..941471cdd39 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h @@ -28,6 +28,7 @@ #define MCA_PML_OB1_RECVFRAG_H #include 
"pml_ob1_hdr.h" +#include "ompi/runtime/ompi_spc.h" BEGIN_C_DECLS @@ -85,6 +86,7 @@ do { \ 0); \ _ptr = (unsigned char*)(buffers[0].addr); \ macro_segments[0].seg_addr.pval = buffers[0].addr; \ + SPC_RECORD(OMPI_SPC_QUEUE_ALLOCATION, buffers[0].len); \ } \ macro_segments[0].seg_len = _size; \ for( i = 0; i < cnt; i++ ) { \ @@ -98,6 +100,7 @@ do { \ do { \ if( frag->segments[0].seg_len > mca_pml_ob1.unexpected_limit ) { \ /* return buffers */ \ + SPC_RECORD(OMPI_SPC_QUEUE_ALLOCATION, -frag->buffers[0].len); \ mca_pml_ob1.allocator->alc_free( mca_pml_ob1.allocator, \ frag->buffers[0].addr ); \ } \ diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 88e7f7252c3..b0ac50d461d 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -644,9 +644,10 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH); - SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)size, - OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI); + if( OPAL_LIKELY( rc >= OPAL_SUCCESS ) ) { + SPC_USER_OR_MPI(sendreq->req_send.req_base.req_tag, (ompi_spc_value_t)size, + OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI); if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 4c52c317003..8a39f212990 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -34,6 +34,7 @@ #include "pml_ob1_rdmafrag.h" #include "ompi/mca/bml/bml.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_spc.h" BEGIN_C_DECLS @@ -396,7 +397,9 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq, } #endif /* OPAL_CUDA_GDR_SUPPORT */ + SPC_BIN_RECORD(OMPI_SPC_P2P_MESSAGE_SIZE, size); if( OPAL_LIKELY(size <= eager_limit) ) { + 
SPC_RECORD(OMPI_SPC_EAGER_MESSAGES, 1); switch(sendreq->req_send.req_send_mode) { case MCA_PML_BASE_SEND_SYNCHRONOUS: rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0); @@ -416,6 +419,7 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq, break; } } else { + SPC_RECORD(OMPI_SPC_NOT_EAGER_MESSAGES, 1); size = eager_limit; if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit)) size = btl->btl_rndv_eager_limit; diff --git a/ompi/runtime/Makefile.am b/ompi/runtime/Makefile.am index e452544844c..646056c4819 100644 --- a/ompi/runtime/Makefile.am +++ b/ompi/runtime/Makefile.am @@ -51,3 +51,4 @@ libompi_mpir_la_SOURCES = \ libompi_mpir_la_CFLAGS = $(MPIR_UNWIND_CFLAGS) lib@OMPI_LIBMPI_NAME@_la_LIBADD += libompi_mpir.la + diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index 1fcb93a35e9..f2f62796004 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -120,6 +120,7 @@ There was an error registering software performance counters (SPCs) as MPI_T performance variables. Your job will continue, but SPCs will be disabled for MPI_T. # + [no-pmi] PMIx_Init failed for the following reason: @@ -128,3 +129,26 @@ PMIx_Init failed for the following reason: Open MPI requires access to a local PMIx server to execute. Please ensure that either you are operating in a PMIx-enabled environment, or use "mpirun" to execute the job. + +[spc: default shared memory directory failed] +Failed to access the default shared memory directory, falling back to storing the shared memory +file in '%s'. +# +[spc: filename creation failure] +Failed to create an appropriate filename for use in storing data in a +shared memory file. +# +[spc: shm segment creation failure] +Failed to create a shared memory segment. +# +[spc: shm atttach failure] +Failed to attach to created shared memory segment. +# +[spc: shm file open failure] +Failed to open shared memory file: '%s'. 
+# +[spc: mmap failure] +Failed to mmap the shared memory file. mmap had the following error: +'%s'. You will be unable to read counters through attaching to the +shared memory file. + diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 07d0ad7b32c..befa28f2918 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -76,7 +76,13 @@ uint32_t ompi_add_procs_cutoff = OMPI_ADD_PROCS_CUTOFF_DEFAULT; bool ompi_mpi_dynamics_enabled = true; char *ompi_mpi_spc_attach_string = NULL; +char *ompi_mpi_spc_xml_string = NULL; bool ompi_mpi_spc_dump_enabled = false; +bool ompi_mpi_spc_mmap_enabled = false; +int ompi_mpi_spc_snapshot_period = 0; +int ompi_mpi_spc_p2p_message_boundary = 12288; +int ompi_mpi_spc_collective_message_boundary = 12288; +int ompi_mpi_spc_collective_comm_boundary = 64; static bool show_default_mca_params = false; static bool show_file_mca_params = false; @@ -332,6 +338,14 @@ int ompi_mpi_register_params(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mpi_spc_attach_string); + ompi_mpi_spc_xml_string = NULL; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_xml_string", + "A string to add to SPC XML files for easier identification. 
The format will be: spc_data.[nodename].[jobid or spc_xml_string].[world_rank].xml", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_xml_string); + ompi_mpi_spc_dump_enabled = false; (void) mca_base_var_register("ompi", "mpi", NULL, "spc_dump_enabled", "A boolean value for whether (true) or not (false) to enable dumping SPC counters in MPI_Finalize.", @@ -340,6 +354,38 @@ int ompi_mpi_register_params(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mpi_spc_dump_enabled); + ompi_mpi_spc_mmap_enabled = false; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_mmap_enabled", + "A boolean value for whether (true) or not (false) to enable dumping SPC counters to an mmap'd file.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_mmap_enabled); + + ompi_mpi_spc_p2p_message_boundary = 12288; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_p2p_message_boundary", + "An integer value for determining the boundary for whether a message is small/large for point to point message size bin counter (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_p2p_message_boundary); + + ompi_mpi_spc_collective_message_boundary = 12288; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_message_boundary", + "An integer value for determining the boundary for whether a message is small/large for collective bin counters (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_collective_message_boundary); + + ompi_mpi_spc_collective_comm_boundary = 64; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_comm_boundary", + "An integer value for determining the boundary for whether a communicator is small/large for collective bin counters (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, 
+ MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_collective_comm_boundary); + return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 654dd158b3e..360431fbc94 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -21,8 +21,11 @@ opal_timer_t sys_clock_freq_mhz = 0; static void ompi_spc_dump(void); -/* Array for converting from SPC indices to MPI_T indices */ +static int mpi_t_offset = -1; static bool mpi_t_enabled = false; +static bool spc_enabled = true; +static bool need_free = false; + static ompi_communicator_t *ompi_spc_comm = NULL; typedef struct ompi_spc_event_t { @@ -136,21 +139,77 @@ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_BYTES_GET, "The number of bytes sent/received using RMA Get operations both through user-level Get functions and internal Get functions."), SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED, "The number of messages that arrived as unexpected messages."), SET_COUNTER_ARRAY(OMPI_SPC_OUT_OF_SEQUENCE, "The number of messages that arrived out of the proper sequence."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_QUEUE_HOPS, "The number of times we jumped to the next element in the out of sequence message queue's ordered list."), SET_COUNTER_ARRAY(OMPI_SPC_MATCH_TIME, "The number of microseconds spent matching unexpected messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_MATCH_QUEUE_TIME, "The number of microseconds spent inserting unexpected messages into the unexpected message queue. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. 
On such a system, this counter could be inaccurate since we assume a fixed clock rate."), SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED_IN_QUEUE, "The number of messages that are currently in the unexpected message queue(s) of an MPI process."), SET_COUNTER_ARRAY(OMPI_SPC_OOS_IN_QUEUE, "The number of messages that are currently in the out of sequence message queue(s) of an MPI process."), SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, "The maximum number of messages that the unexpected message queue(s) within an MPI process " "contained at once since the last reset of this counter. Note: This counter is reset each time it is read."), SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_IN_QUEUE, "The maximum number of messages that the out of sequence message queue(s) within an MPI process " - "contained at once since the last reset of this counter. Note: This counter is reset each time it is read.") + "contained at once since the last reset of this counter. Note: This counter is reset each time it is read."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_LINEAR, "The number of times the base broadcast used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_CHAIN, "The number of times the base broadcast used the chain algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_PIPELINE, "The number of times the base broadcast used the pipeline algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_SPLIT_BINTREE, "The number of times the base broadcast used the split binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_BINTREE, "The number of times the base broadcast used the binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_BINOMIAL, "The number of times the base broadcast used the binomial algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_CHAIN, "The number of times the base reduce used the chain algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_PIPELINE, "The number of times the base reduce used the pipeline algorithm."), + 
SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_BINARY, "The number of times the base reduce used the binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_BINOMIAL, "The number of times the base reduce used the binomial tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE, "The number of times the base reduce used the in order binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_LINEAR, "The number of times the base reduce used the basic linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING, "The number of times the base reduce scatter used the nonoverlapping algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING, "The number of times the base reduce scatter used the recursive halving algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_SCATTER_RING, "The number of times the base reduce scatter used the ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING, "The number of times the base allreduce used the nonoverlapping algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING, "The number of times the base allreduce used the recursive doubling algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_RING, "The number of times the base allreduce used the ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED, "The number of times the base allreduce used the segmented ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_LINEAR, "The number of times the base allreduce used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_SCATTER_BINOMIAL, "The number of times the base scatter used the binomial tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_SCATTER_LINEAR, "The number of times the base scatter used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_GATHER_BINOMIAL, "The number of times the base gather used the binomial tree algorithm."), + 
SET_COUNTER_ARRAY(OMPI_SPC_BASE_GATHER_LINEAR_SYNC, "The number of times the base gather used the synchronous linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_GATHER_LINEAR, "The number of times the base gather used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_INPLACE, "The number of times the base alltoall used the in-place algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_PAIRWISE, "The number of times the base alltoall used the pairwise algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_BRUCK, "The number of times the base alltoall used the bruck algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC, "The number of times the base alltoall used the synchronous linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_TWO_PROCS, "The number of times the base alltoall used the two process algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_LINEAR, "The number of times the base alltoall used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_BRUCK, "The number of times the base allgather used the bruck algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING, "The number of times the base allgather used the recursive doubling algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_RING, "The number of times the base allgather used the ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE, "The number of times the base allgather used the neighbor exchange algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_TWO_PROCS, "The number of times the base allgather used the two process algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_LINEAR, "The number of times the base allgather used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_DOUBLE_RING, "The number of times the base barrier used the double ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING, "The number of times the base 
barrier used the recursive doubling algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_BRUCK, "The number of times the base barrier used the bruck algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_TWO_PROCS, "The number of times the base barrier used the two process algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_LINEAR, "The number of times the base barrier used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_TREE, "The number of times the base barrier used the tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_P2P_MESSAGE_SIZE, "This is a bin counter with two subcounters. The first is messages that are less than or equal to 12288 bytes and the second is those that are larger than 12288 bytes."), + SET_COUNTER_ARRAY(OMPI_SPC_EAGER_MESSAGES, "The number of messages that fall within the eager size."), + SET_COUNTER_ARRAY(OMPI_SPC_NOT_EAGER_MESSAGES, "The number of messages that do not fall within the eager size."), + SET_COUNTER_ARRAY(OMPI_SPC_QUEUE_ALLOCATION, "The amount of memory allocated after runtime currently in use for temporary message queues like the unexpected message queue and the out of sequence message queue.") }; -/* An array of integer values to denote whether an event is activated (1) or not (0) */ -static uint32_t ompi_spc_attached_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; -/* An array of integer values to denote whether an event is timer-based (1) or not (0) */ +/* A bitmap to denote whether an event is activated (1) or not (0) */ +OMPI_DECLSPEC uint32_t ompi_spc_attached_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; +/* A bitmap to denote whether an event is timer-based (1) or not (0) */ static uint32_t ompi_spc_timer_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; +/* A bitmap to denote whether an event is bin-based (1) or not (0) */ +static uint32_t ompi_spc_bin_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; +/* A bitmap to denote whether an event is collective 
bin-based (1) or not (0) */ +static uint32_t ompi_spc_collective_bin_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; + /* An array of event structures to store the event data (name and value) */ -static ompi_spc_t *ompi_spc_events = NULL; +void *ompi_spc_events = NULL; +static ompi_spc_offset_t ompi_spc_offsets[OMPI_SPC_NUM_COUNTERS] = {-1}; +static ompi_spc_value_t *ompi_spc_values = NULL; static inline void SET_SPC_BIT(uint32_t* array, int32_t pos) { @@ -158,7 +217,7 @@ static inline void SET_SPC_BIT(uint32_t* array, int32_t pos) array[pos / (8 * sizeof(uint32_t))] |= (1U << (pos % (8 * sizeof(uint32_t)))); } -static inline bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos) +inline bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos) { assert(pos < OMPI_SPC_NUM_COUNTERS); return !!(array[pos / (8 * sizeof(uint32_t))] & (1U << (pos % (8 * sizeof(uint32_t))))); @@ -188,19 +247,95 @@ static int ompi_spc_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, v index = (int)(uintptr_t)pvar->ctx; /* Convert from MPI_T pvar index to SPC index */ /* For this event, we need to set count to the number of long long type - * values for this counter. All SPC counters are one long long, so we - * always set count to 1. + * values for this counter. Most SPC counters are one long long so the + * default is 1, however bin counters and the xml string can be longer. 
*/ - if(MCA_BASE_PVAR_HANDLE_BIND == event) { - *count = 1; + do { + if(MCA_BASE_PVAR_HANDLE_BIND == event) { + /* Convert from MPI_T pvar index to SPC index */ + index = pvar->pvar_index - mpi_t_offset; + if(index < 0) { + char *shm_dir; + if(0 == access(SPC_SHM_DIR, W_OK)) { + shm_dir = SPC_SHM_DIR; + } else { + opal_show_help("help-mpi-runtime.txt", "spc: default shared memory directory failed", true, opal_process_info.job_session_dir); + shm_dir = opal_process_info.job_session_dir; + } + + int rank = ompi_comm_rank(ompi_spc_comm), rc; + char filename[SPC_MAX_FILENAME]; + + if(ompi_mpi_spc_xml_string == NULL) { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d.xml", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + } else { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, + opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); + } + + *count = strlen(filename); + break; + } + if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) ) { /* TODO: make sure this works */ + *count = *(int*)(ompi_spc_events+ompi_spc_offsets[index].rules_offset); + printf("Count: %d\n", *count); + } else { + *count = 1; + } + } + /* For this event, we need to turn on the counter */ + else if(MCA_BASE_PVAR_HANDLE_START == event) { + /* Convert from MPI_T pvar index to SPC index */ + index = pvar->pvar_index - mpi_t_offset; + if(index >= 0) { + SET_SPC_BIT(ompi_spc_attached_event, index); + } + } + /* For this event, we need to turn off the counter */ + else if(MCA_BASE_PVAR_HANDLE_STOP == event) { + /* Convert from MPI_T pvar index to SPC index */ + index = pvar->pvar_index - mpi_t_offset; + if(index >= 0) { + CLEAR_SPC_BIT(ompi_spc_attached_event, index); + } + } + } while(0); + + return MPI_SUCCESS; +} + +static int ompi_spc_get_xml_filename(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) + __opal_attribute_unused__; + +static int 
ompi_spc_get_xml_filename(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) +{ + int rc; + char **filename, *shm_dir; + + if(OPAL_LIKELY(!mpi_t_enabled)) { + filename = (char**)value; + rc = sprintf(*filename, ""); + + return MPI_SUCCESS; } - /* For this event, we need to turn on the counter */ - else if(MCA_BASE_PVAR_HANDLE_START == event) { - SET_SPC_BIT(ompi_spc_attached_event, index); + + if(0 == access(SPC_SHM_DIR, W_OK)) { + shm_dir = SPC_SHM_DIR; + } else { + opal_show_help("help-mpi-runtime.txt", "spc: default shared memory directory failed", true, opal_process_info.job_session_dir); + shm_dir = opal_process_info.job_session_dir; } - /* For this event, we need to turn off the counter */ - else if(MCA_BASE_PVAR_HANDLE_STOP == event) { - CLEAR_SPC_BIT(ompi_spc_attached_event, index); + + int rank = ompi_comm_rank(ompi_spc_comm); + + filename = (char**)value; + if(ompi_mpi_spc_xml_string == NULL) { + rc = sprintf(*filename, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d.xml", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + } else { + rc = sprintf(*filename, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, + opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); } return MPI_SUCCESS; @@ -221,24 +356,36 @@ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, v static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) { - long long *counter_value = (long long*)value; - if(OPAL_LIKELY(!mpi_t_enabled)) { + long long *counter_value = (long long*)value; *counter_value = 0; return MPI_SUCCESS; } /* Convert from MPI_T pvar index to SPC index */ - int index = (int)(uintptr_t)pvar->ctx; + int index = pvar->pvar_index - mpi_t_offset; + + /* If this is a bin-based counter, set 'value' to the array of bin values */ + if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) || IS_SPC_BIT_SET(ompi_spc_collective_bin_event, index) ) { + long long **bin_value = (long long**)value; + 
*bin_value = (long long*)(ompi_spc_events+ompi_spc_offsets[index].bins_offset); + return MPI_SUCCESS; + } + + long long *counter_value = (long long*)value; /* Set the counter value to the current SPC value */ - *counter_value = (long long)ompi_spc_events[index].value; + *counter_value = ompi_spc_values[index]; + /* If this is a timer-based counter, convert from cycles to microseconds */ if( IS_SPC_BIT_SET(ompi_spc_timer_event, index) ) { *counter_value /= sys_clock_freq_mhz; } /* If this is a high watermark counter, reset it after it has been read */ - if(index == OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE || index == OMPI_SPC_MAX_OOS_IN_QUEUE) { - ompi_spc_events[index].value = 0; + if(index == OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_UNEXPECTED_IN_QUEUE]; + } + if(index == OMPI_SPC_MAX_OOS_IN_QUEUE) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_OOS_IN_QUEUE]; } return MPI_SUCCESS; @@ -247,26 +394,336 @@ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, v /* Initializes the events data structure and allocates memory for it if needed. 
*/ void ompi_spc_events_init(void) { - int i; + ompi_comm_dup(&ompi_mpi_comm_world.comm, &ompi_spc_comm); + + int i, value_offset = 0, bin_offset = OMPI_SPC_NUM_COUNTERS*sizeof(ompi_spc_value_t), rank = ompi_comm_rank(ompi_spc_comm), shm_fd, rc, ret; + char filename[SPC_MAX_FILENAME], *shm_dir; + void *ptr; + + if(0 > rc) { + opal_show_help("help-mpi-runtime.txt", "spc: filename creation failure", true); + } + + FILE *fptr, *shm_fptr = NULL; + char sm_file[SPC_MAX_FILENAME], *my_segment; + opal_shmem_ds_t shm_ds; - /* If the events data structure hasn't been allocated yet, allocate memory for it */ - if(NULL == ompi_spc_events) { - ompi_spc_events = (ompi_spc_t*)malloc(OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_t)); - if(ompi_spc_events == NULL) { + if(ompi_mpi_spc_mmap_enabled) { + /* Determine the location for saving the shared memory file */ + if(0 == access(SPC_SHM_DIR, W_OK)) { + shm_dir = SPC_SHM_DIR; + } else { + opal_show_help("help-mpi-runtime.txt", "spc: default shared memory directory failed", true, opal_process_info.job_session_dir); + shm_dir = opal_process_info.job_session_dir; + } + + /* Create a shared memory file */ + + rc = snprintf(sm_file, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + + if (0 > rc) { + opal_show_help("help-mpi-runtime.txt", "spc: filename creation failure", true); + } + + if(ompi_mpi_spc_xml_string == NULL) { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d.xml", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + } else { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, + opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); + } + fptr = fopen(filename, "w+"); + + /* Registers the name/path of the XML file as an MPI_T pvar */ + ret = mca_base_pvar_register("ompi", "runtime", "spc", "OMPI_SPC_XML_FILE", "The filename for the SPC XML file for 
using the mmap interface.", + OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_GENERIC, + MCA_BASE_VAR_TYPE_STRING, NULL, MPI_T_BIND_NO_OBJECT, + MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, + ompi_spc_get_xml_filename, NULL, ompi_spc_notify, NULL); + if(ret < 0) { + printf("There was an error -> %s\n", opal_strerror(ret)); + } + + + fprintf(fptr, "\n"); + fprintf(fptr, "\n"); + } + + /* ######################################################################## + * ################## Add Timer Based Counter Enums Here ################## + * ######################################################################## + */ + + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_QUEUE_TIME); + + /* ############################################################################### + * ###################### Put Bin Counter Sizes Here ############################# + * ############################################################################### + */ + int data_size = OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_value_t); + + /* NOTE: If there are an odd number of bins, there could potentially be some false + * sharing with other counters, so make sure the data size is incremented by + * a multiple of cache line size (typically 8 bytes). 
+ */ + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_P2P_MESSAGE_SIZE); + ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].num_bins = 2; + data_size += 2 * (sizeof(int) + sizeof(ompi_spc_value_t)); + + /* ######################################################################## + * ############## Add Collective Bin-Based Counter Enums Here ############# + * ######################################################################## + */ + /* For each collective bin counter we must set the bitmap bit, allocate memory for the arrays and populate the bin_rules array */ + + /* Allgather Algorithms */ + /* Collective bin counter for the Bruck Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_BRUCK); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_BRUCK); + /* Collective bin counter for the Recursive Doubling Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING); + /* Collective bin counter for the Ring Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_RING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_RING); + /* Collective bin counter for the Neighbor Exchange Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE); + /* Collective bin counter for the Two Process Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_TWO_PROCS); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_TWO_PROCS); + /* Collective bin counter for the Linear Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_LINEAR); + + /* Allreduce Algorithms */ + /* Collective bin counter for the 
Nonoverlapping Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING); + /* Collective bin counter for the Recursive Doubling Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING); + /* Collective bin counter for the Ring Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING); + /* Collective bin counter for the Segmented Ring Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED); + /* Collective bin counter for the Linear Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_LINEAR); + + /* All-to-All Algorithms */ + /* Collective bin counter for the Inplace Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_INPLACE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_INPLACE); + /* Collective bin counter for the Pairwise Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_PAIRWISE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_PAIRWISE); + /* Collective bin counter for the Bruck Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_BRUCK); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_BRUCK); + /* Collective bin counter for the Linear Sync Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC); + /* Collective 
bin counter for the Two Process Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_TWO_PROCS); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_TWO_PROCS); + /* Collective bin counter for the Linear Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR); + + /* Broadcast Algorithms */ + /* Collective bin counter for the Chain Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_CHAIN); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_CHAIN); + /* Collective bin counter for the Binomial Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_BINOMIAL); + /* Collective bin counter for the Pipeline Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_PIPELINE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_PIPELINE); + /* Collective bin counter for the Split Binary Tree Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_SPLIT_BINTREE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_SPLIT_BINTREE); + /* Collective bin counter for the Binary Tree Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_BINTREE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_BINTREE); + + /* Gather Algorithms */ + /* Collective bin counter for the Binomial Gather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_GATHER_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_GATHER_BINOMIAL); + /* Collective bin counter for the Linear Sync Gather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_GATHER_LINEAR_SYNC); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_GATHER_LINEAR_SYNC); + /* Collective bin counter for the Linear 
Gather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_GATHER_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_GATHER_LINEAR); + + /* Reduce Algorithms */ + /* Collective bin counter for the Chain Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_CHAIN); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_CHAIN); + /* Collective bin counter for the Pipeline Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_PIPELINE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_PIPELINE); + /* Collective bin counter for the Binary Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_BINARY); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_BINARY); + /* Collective bin counter for the Binomial Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_BINOMIAL); + /* Collective bin counter for the In Order Binary Tree Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE); + /* Collective bin counter for the Linear Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_LINEAR); + + /* Reduce Scatter Algorithms */ + /* Collective bin counter for the Nonoverlapping Reduce Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING); + /* Collective bin counter for the Recursive Halving Reduce Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING); + /* 
Collective bin counter for the Ring Reduce Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RING); + + /* Scatter Algorithms */ + /* Collective bin counter for the Binomial Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_SCATTER_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_SCATTER_BINOMIAL); + /* Collective bin counter for the Linear Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_SCATTER_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_SCATTER_LINEAR); + +#if 0 + /* X Algorithms */ + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); +#endif + + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event,i)) { + data_size += 4 * (sizeof(int) + sizeof(ompi_spc_value_t)); + ompi_spc_offsets[i].num_bins = 4; + } + } + + /* ############################################################################### + * 
############################################################################### * ############################################################################### */ + + int bytes_needed = PAGE_SIZE * ((data_size + PAGE_SIZE - 1) / PAGE_SIZE); + + if(ompi_mpi_spc_mmap_enabled) { + rc = opal_shmem_segment_create(&shm_ds, sm_file, bytes_needed); + if (OPAL_SUCCESS != rc) { + opal_show_help("help-mpi-runtime.txt", "spc: shm segment creation failure", true); + } + int default_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + shm_fd = open(sm_file, O_RDWR | O_CREAT | O_NONBLOCK, default_permissions); + if(0 > shm_fd) { + opal_show_help("help-mpi-runtime.txt", "spc: shm file open failure", true, strerror(errno)); + } + + my_segment = opal_shmem_segment_attach(&shm_ds); + if(NULL == my_segment) { + opal_show_help("help-mpi-runtime.txt", "spc: shm attach failure", true); + } + } + + /* If the mmap fails, we can fall back to malloc to allocate the data. If malloc fails, then we can't + * continue and the counters will have to be disabled.
+ */ + if(!ompi_mpi_spc_mmap_enabled) { + goto map_failed; + } + if(MAP_FAILED == (ompi_spc_events = mmap(0, bytes_needed, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0))) { + opal_show_help("help-mpi-runtime.txt", "spc: mmap failure", true, strerror(errno)); + map_failed: + ompi_spc_events = NULL; + /* mmap failed, so try malloc */ + if(NULL == (ompi_spc_events = malloc(data_size))) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); + spc_enabled = false; return; + } else { + need_free = true; /* Since we malloc'd this data we will need to free it */ } } + + ompi_spc_values = (ompi_spc_value_t*)ompi_spc_events; + + if(ompi_mpi_spc_mmap_enabled) { + fprintf(fptr, "\t%s\n", sm_file); + fprintf(fptr, "\t%d\n", OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_t)); + fprintf(fptr, "\t%d\n", OMPI_SPC_NUM_COUNTERS); + fprintf(fptr, "\t%d\n", sys_clock_freq_mhz); + } + /* The data structure has been allocated, so we simply initialize all of the counters * with their names and an initial count of 0. 
*/ for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - ompi_spc_events[i].name = (char*)ompi_spc_events_names[i].counter_name; - ompi_spc_events[i].value = 0; + ompi_spc_values[i] = 0; + + /* Add this counter to the XML document */ + if(ompi_mpi_spc_mmap_enabled) { + fprintf(fptr, "\t\n"); + fprintf(fptr, "\t\t%s\n", ompi_spc_events_names[i].counter_name); + fprintf(fptr, "\t\t%d\n", value_offset); + } + value_offset += sizeof(ompi_spc_value_t); + + if(ompi_spc_offsets[i].num_bins > 0) { + ompi_spc_offsets[i].rules_offset = bin_offset; + bin_offset += ompi_spc_offsets[i].num_bins*sizeof(int); + ompi_spc_offsets[i].bins_offset = bin_offset; + bin_offset += ompi_spc_offsets[i].num_bins*sizeof(ompi_spc_value_t); + + int mod = bin_offset % CACHE_LINE; + if(mod != 0) { + bin_offset += CACHE_LINE - mod; + } + } else { + ompi_spc_offsets[i].rules_offset = -1; + ompi_spc_offsets[i].bins_offset = -1; + } + if(ompi_mpi_spc_mmap_enabled) { + fprintf(fptr, "\t\t%d\n", ompi_spc_offsets[i].rules_offset); + fprintf(fptr, "\t\t%d\n", ompi_spc_offsets[i].bins_offset); + fprintf(fptr, "\t\n"); + } } - ompi_comm_dup(&ompi_mpi_comm_world.comm, &ompi_spc_comm); + if(ompi_mpi_spc_mmap_enabled) { + fprintf(fptr, "\n"); + fclose(fptr); + } } /* Initializes the SPC data structures and registers all counters as MPI_T pvars. 
@@ -280,6 +737,9 @@ void ompi_spc_init(void) sys_clock_freq_mhz = opal_timer_base_get_freq() / 1000000; ompi_spc_events_init(); + if(!spc_enabled) { + return; + } /* Get the MCA params string of counters to turn on */ char **arg_strings = opal_argv_split(ompi_mpi_spc_attach_string, ','); @@ -308,8 +768,7 @@ void ompi_spc_init(void) } } } - - if (matched) { + if( matched ) { SET_SPC_BIT(ompi_spc_attached_event, i); mpi_t_enabled = true; found++; @@ -317,19 +776,60 @@ void ompi_spc_init(void) /* Registers the current counter as an MPI_T pvar regardless of whether it's been turned on or not */ ret = mca_base_pvar_register("ompi", "runtime", "spc", ompi_spc_events_names[i].counter_name, ompi_spc_events_names[i].counter_description, - OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE, + OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_COUNTER, MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, MPI_T_BIND_NO_OBJECT, MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, - ompi_spc_get_count, NULL, ompi_spc_notify, (void*)(uintptr_t)i); - if( ret < 0 ) { + ompi_spc_get_count, NULL, ompi_spc_notify, NULL); + + /* Check to make sure that ret is a valid index and not an error code. 
+ */ + if( ret >= 0 ) { + if( mpi_t_offset == -1 ) { + mpi_t_offset = ret; + } + } + if( (ret < 0) || (all_on && (ret != (mpi_t_offset + found - 1))) ) { + printf("ret -> %d\n", ret); mpi_t_enabled = false; opal_show_help("help-mpi-runtime.txt", "spc: MPI_T disabled", true); break; } } - /* If this is a timer event, set the corresponding timer_event entry */ - SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME); + /* ######################################################################## + * ###################### Initialize Bin Counters Here #################### + * ######################################################################## + */ + + int *rules = NULL; + ompi_spc_value_t *bins = NULL; + + rules = (int*)(ompi_spc_events+ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].bins_offset); + + bins[0] = bins[1] = 0; + + rules[0] = 2; /* The number of bins */ + rules[1] = ompi_mpi_spc_p2p_message_boundary; /* The number after which counters go in the second bin */ + + /* Initialize Collective Bin Counters Here */ + int num_bins = 4; /* TODO: make these user-defined */ + + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event,i)) { + rules = (int*)(ompi_spc_events+ompi_spc_offsets[i].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[i].bins_offset); + + bins[0] = bins[1] = bins[2] = bins[3] = 0; + + rules[0] = num_bins; /* The number of bins */ + rules[1] = ompi_mpi_spc_collective_message_boundary; /* The 'small message' break point */ + rules[2] = ompi_mpi_spc_collective_comm_boundary; /* The 'small communicator' break point */ + rules[3] = 0; /* Placeholder for now */ + + ompi_spc_offsets[i].num_bins = 4; + } + } opal_argv_free(arg_strings); } @@ -339,8 +839,10 @@ void ompi_spc_init(void) */ static void ompi_spc_dump(void) { - int i, j, world_size, offset; + int i, j, k, world_size, offset, 
bin_offset; long long *recv_buffer = NULL, *send_buffer; + int *rules; + ompi_spc_value_t *bins; int rank = ompi_comm_rank(ompi_spc_comm); world_size = ompi_comm_size(ompi_spc_comm); @@ -348,30 +850,65 @@ static void ompi_spc_dump(void) /* Convert from cycles to usecs before sending */ for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { if( IS_SPC_BIT_SET(ompi_spc_timer_event, i) ) { - SPC_CYCLES_TO_USECS(&ompi_spc_events[i].value); + SPC_CYCLES_TO_USECS(&ompi_spc_values[i]); + } + } + + size_t buffer_size = OMPI_SPC_NUM_COUNTERS * sizeof(long long); + int buffer_len = OMPI_SPC_NUM_COUNTERS; + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++){ + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + /* Increment the buffer size enough to store the bin_rules and bins values */ + rules = (int*)(ompi_spc_events+ompi_spc_offsets[i].rules_offset); + buffer_size += rules[0] * 2 * sizeof(long long); + buffer_len += rules[0] * 2; } } /* Aggregate all of the information on rank 0 using MPI_Gather on MPI_COMM_WORLD */ - send_buffer = (long long*)malloc(OMPI_SPC_NUM_COUNTERS * sizeof(long long)); + send_buffer = (long long*)malloc(buffer_size); if (NULL == send_buffer) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); return; } + bin_offset = OMPI_SPC_NUM_COUNTERS; for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - send_buffer[i] = (long long)ompi_spc_events[i].value; + send_buffer[i] = (long long)ompi_spc_values[i]; + /* If this is a bin counter we need to append its arrays to the end of the send buffer */ + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + rules = (int*)(ompi_spc_events+ompi_spc_offsets[i].rules_offset); + for(j = 0; j < rules[0]; j++) { + send_buffer[bin_offset] = (long long)rules[j]; + bin_offset++; + } + /* A flag to check if all bins are 0 */ + int is_empty = 1; + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[i].bins_offset); + for(j = 0; j < rules[0]; j++) { + send_buffer[bin_offset] = (long long)bins[j]; + bin_offset++; + + 
if(bins[j] > 0) { + is_empty = 0; + } + } + /* Even if all bins are 0 we still send it for ease, even though it won't be printed */ + if(!is_empty && !IS_SPC_BIT_SET(ompi_spc_collective_bin_event, i)) { + send_buffer[i] = 1; + } + } } if( 0 == rank ) { - recv_buffer = (long long*)malloc(world_size * OMPI_SPC_NUM_COUNTERS * sizeof(long long)); + recv_buffer = (long long*)malloc(world_size * buffer_size); if (NULL == recv_buffer) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); return; } } - (void)ompi_spc_comm->c_coll->coll_gather(send_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG, - recv_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG, + (void)ompi_spc_comm->c_coll->coll_gather(send_buffer, buffer_len, MPI_LONG_LONG, + recv_buffer, buffer_len, MPI_LONG_LONG, 0, ompi_spc_comm, ompi_spc_comm->c_coll->coll_gather_module); @@ -379,25 +916,65 @@ static void ompi_spc_dump(void) if(rank == 0) { opal_output(0, "Open MPI Software-based Performance Counters:\n"); offset = 0; /* Offset into the recv_buffer for each rank */ + bin_offset = OMPI_SPC_NUM_COUNTERS; for(j = 0; j < world_size; j++) { opal_output(0, "MPI_COMM_WORLD Rank %d:\n", j); for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - /* If this is a timer-based counter, we need to covert from cycles to usecs */ + /* Don't print counters with zero values */ if( 0 == recv_buffer[offset+i] ) { + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + bin_offset += recv_buffer[bin_offset]*2; + } continue; } - opal_output(0, "%s -> %lld\n", ompi_spc_events[i].name, recv_buffer[offset+i]); + /* This is a non-zero bin counter */ + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + /* This is a non-zero collective bin counter */ + if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event, i)) { + opal_output(0, "%s -> %lld\n", ompi_spc_events_names[i].counter_name, recv_buffer[offset+i]); + + int num_bins = recv_buffer[bin_offset]; + int message_boundary = recv_buffer[bin_offset+1]; + int process_boundary = 
recv_buffer[bin_offset+2]; + + opal_output(0, "\tSmall Messages (<= %lld bytes)\n", message_boundary); + opal_output(0, "\t\tSmall Comm (<= %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins]); + opal_output(0, "\t\tLarge Comm (> %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins+1]); + opal_output(0, "\tLarge Messages (> %lld bytes)\n", message_boundary); + opal_output(0, "\t\tSmall Comm (<= %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins+2]); + opal_output(0, "\t\tLarge Comm (> %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins+3]); + + bin_offset += num_bins*2; + continue; + } + opal_output(0, "%s\n", ompi_spc_events_names[i].counter_name); + int num_bins = recv_buffer[bin_offset]; + for(k = 0; k < num_bins; k++){ + if(k == 0) { + opal_output(0, "\t-inf to %lld -> %lld\n", recv_buffer[bin_offset+1], recv_buffer[bin_offset+num_bins]); + } else if(k < num_bins-1) { + opal_output(0, "\t%lld to %lld -> %lld\n", recv_buffer[bin_offset+k]+1, recv_buffer[bin_offset+k+1], recv_buffer[bin_offset+num_bins+k]); + } else { + opal_output(0, "\t%lld to inf -> %lld\n", recv_buffer[bin_offset+k]+1, recv_buffer[bin_offset+num_bins+k]); + } + } + bin_offset += num_bins*2; + continue; + } + /* This is a non-zero normal counter */ + opal_output(0, "%s -> %lld\n", ompi_spc_events_names[i].counter_name, recv_buffer[offset+i]); } opal_output(0, "\n"); - offset += OMPI_SPC_NUM_COUNTERS; + offset += buffer_len; + bin_offset += OMPI_SPC_NUM_COUNTERS; } - printf("###########################################################################\n"); - printf("NOTE: Any counters not shown here were either disabled or had a value of 0.\n"); - printf("###########################################################################\n"); + opal_output(0, "###########################################################################\n"); + opal_output(0, "NOTE: Any counters not shown here were 
either disabled or had a value of 0.\n"); + opal_output(0, "###########################################################################\n"); - free(recv_buffer); + free(recv_buffer); recv_buffer = NULL; } - free(send_buffer); + free(send_buffer); send_buffer = NULL; ompi_spc_comm->c_coll->coll_barrier(ompi_spc_comm, ompi_spc_comm->c_coll->coll_barrier_module); } @@ -405,20 +982,82 @@ static void ompi_spc_dump(void) /* Frees any dynamically alocated OMPI SPC data structures */ void ompi_spc_fini(void) { + int fd, rc; + char sm_file[SPC_MAX_FILENAME]; + char *shm_dir = SPC_SHM_DIR; + orte_proc_t *pptr; + + int rank = ompi_comm_rank(ompi_spc_comm); + if (SPC_ENABLE == 1 && ompi_mpi_spc_dump_enabled) { ompi_spc_dump(); } - free(ompi_spc_events); ompi_spc_events = NULL; - ompi_comm_free(&ompi_spc_comm); + int i; + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + CLEAR_SPC_BIT(ompi_spc_attached_event, i); + CLEAR_SPC_BIT(ompi_spc_timer_event, i); + CLEAR_SPC_BIT(ompi_spc_bin_event, i); + CLEAR_SPC_BIT(ompi_spc_collective_bin_event, i); + } + if(need_free) { + free(ompi_spc_events); ompi_spc_events = NULL; + } + ompi_comm_free(&ompi_spc_comm); ompi_spc_comm = NULL; } /* Records an update to a counter using an atomic add operation. */ void ompi_spc_record(unsigned int event_id, ompi_spc_value_t value) { - /* Denoted unlikely because counters will often be turned off. */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { - OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_events[event_id].value), value); + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[event_id]), value); +} + +/* Records an update to a bin counter using an atomic add operation. 
*/ +void ompi_spc_bin_record(unsigned int event_id, ompi_spc_value_t value) +{ + int *rules; + ompi_spc_value_t *bins; + + /* Update the total number of times this counter has been triggered */ + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[event_id]), 1); + rules = (int*)(ompi_spc_events+ompi_spc_offsets[event_id].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[event_id].bins_offset); + + int i, num_bins = rules[0]; + /* Update the appropriate bin */ + for(i = 1; i < num_bins; i++) { + if(value <= rules[i]) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[i-1]), 1); + return; + } + } + /* This didn't fall within any of the other bins, so it must belong to the last bin */ + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[num_bins-1]), 1); +} + +/* Records an update to a counter using an atomic add operation. */ +void ompi_spc_collective_bin_record(unsigned int event_id, ompi_spc_value_t bytes, ompi_spc_value_t procs) +{ + int *rules; + ompi_spc_value_t *bins; + + rules = (int*)(ompi_spc_events+ompi_spc_offsets[event_id].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[event_id].bins_offset); + + uint small_message = (bytes <= rules[1]); + uint small_comm = (procs <= rules[2]); + /* Always update the total number of times this collective algorithm was called */ + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[event_id]), 1); + + /* Update the appropriate bin for the message size and number of processes */ + if(small_message && small_comm) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[0]), 1); + } else if(small_message && !small_comm) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[1]), 1); + } else if(!small_message && small_comm) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[2]), 1); + } else { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[3]), 1); } } @@ -428,10 +1067,8 @@ void ompi_spc_record(unsigned int event_id, ompi_spc_value_t value) */ void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles) { - /* Check whether cycles == 0.0 to make 
sure the timer hasn't started yet. - * This is denoted unlikely because the counters will often be turned off. - */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id) && *cycles == 0) ) { + /* Check whether cycles == 0.0 to make sure the timer hasn't started yet. */ + if( *cycles == 0 ) { *cycles = opal_timer_base_get_cycles(); } } @@ -442,11 +1079,8 @@ void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles) */ void ompi_spc_timer_stop(unsigned int event_id, opal_timer_t *cycles) { - /* This is denoted unlikely because the counters will often be turned off. */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { - *cycles = opal_timer_base_get_cycles() - *cycles; - OPAL_THREAD_ADD_FETCH_SIZE_T(&ompi_spc_events[event_id].value, (size_t) *cycles); - } + *cycles = opal_timer_base_get_cycles() - *cycles; + OPAL_THREAD_ADD_FETCH_SIZE_T(&ompi_spc_values[event_id], (ompi_spc_value_t) *cycles); } /* Checks a tag, and records the user version of the counter if it's greater @@ -460,19 +1094,23 @@ void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enu /* Checks whether the counter denoted by value_enum exceeds the current value of the * counter denoted by watermark_enum, and if so sets the watermark_enum counter to the * value of the value_enum counter. + * + * WARNING: This assumes that this function was called while a lock has already been taken. + * This function is NOT thread safe otherwise! */ void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum) { - /* Denoted unlikely because counters will often be turned off. */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, watermark_enum) && - IS_SPC_BIT_SET(ompi_spc_attached_event, value_enum)) ) { - /* WARNING: This assumes that this function was called while a lock has already been taken. - * This function is NOT thread safe otherwise! 
- */ - if(ompi_spc_events[value_enum].value > ompi_spc_events[watermark_enum].value) { - ompi_spc_events[watermark_enum].value = ompi_spc_events[value_enum].value; - } + if(ompi_spc_values[value_enum] > ompi_spc_values[watermark_enum]) { + ompi_spc_values[watermark_enum] = ompi_spc_values[value_enum]; + } +} + +ompi_spc_value_t ompi_spc_get_value(unsigned int event_id) +{ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { + return ompi_spc_values[event_id]; /* Note: this is not thread-safe */ } + return 0; } /* Converts a counter value that is in cycles to microseconds. diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 5d040511c34..878bed5245c 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 The University of Tennessee and The University + * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2018 Research Organization for Information Science @@ -18,7 +18,12 @@ #include #include #include +#include #include +#include +#include /* For mode constants */ +#include /* For O_* constants */ +#include #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" @@ -28,6 +33,13 @@ #include "opal/util/argv.h" #include "opal/util/show_help.h" #include "opal/util/output.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/mca/pmix/pmix.h" + +#define PAGE_SIZE 4096 /* The number of bytes in a page. TODO: This should be found programatically */ +#define CACHE_LINE 8 /* The number of bytes in a cache line. 
TODO: This should be found programatically */ +#define SPC_MAX_FILENAME PATH_MAX /* The maximum length allowed for the spc file strings */ +#define SPC_SHM_DIR "/dev/shm" /* The default directory for shared memory files */ #include MCA_timer_IMPLEMENTATION_HEADER @@ -155,25 +167,85 @@ typedef enum ompi_spc_counters { OMPI_SPC_BYTES_GET, OMPI_SPC_UNEXPECTED, OMPI_SPC_OUT_OF_SEQUENCE, + OMPI_SPC_OOS_QUEUE_HOPS, OMPI_SPC_MATCH_TIME, + OMPI_SPC_MATCH_QUEUE_TIME, OMPI_SPC_UNEXPECTED_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_MAX_OOS_IN_QUEUE, + OMPI_SPC_BASE_BCAST_LINEAR, + OMPI_SPC_BASE_BCAST_CHAIN, + OMPI_SPC_BASE_BCAST_PIPELINE, + OMPI_SPC_BASE_BCAST_SPLIT_BINTREE, + OMPI_SPC_BASE_BCAST_BINTREE, + OMPI_SPC_BASE_BCAST_BINOMIAL, + OMPI_SPC_BASE_REDUCE_CHAIN, + OMPI_SPC_BASE_REDUCE_PIPELINE, + OMPI_SPC_BASE_REDUCE_BINARY, + OMPI_SPC_BASE_REDUCE_BINOMIAL, + OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE, + OMPI_SPC_BASE_REDUCE_LINEAR, + OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING, + OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING, + OMPI_SPC_BASE_REDUCE_SCATTER_RING, + OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING, + OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING, + OMPI_SPC_BASE_ALLREDUCE_RING, + OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED, + OMPI_SPC_BASE_ALLREDUCE_LINEAR, + OMPI_SPC_BASE_SCATTER_BINOMIAL, + OMPI_SPC_BASE_SCATTER_LINEAR, + OMPI_SPC_BASE_GATHER_BINOMIAL, + OMPI_SPC_BASE_GATHER_LINEAR_SYNC, + OMPI_SPC_BASE_GATHER_LINEAR, + OMPI_SPC_BASE_ALLTOALL_INPLACE, + OMPI_SPC_BASE_ALLTOALL_PAIRWISE, + OMPI_SPC_BASE_ALLTOALL_BRUCK, + OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC, + OMPI_SPC_BASE_ALLTOALL_TWO_PROCS, + OMPI_SPC_BASE_ALLTOALL_LINEAR, + OMPI_SPC_BASE_ALLGATHER_BRUCK, + OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING, + OMPI_SPC_BASE_ALLGATHER_RING, + OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE, + OMPI_SPC_BASE_ALLGATHER_TWO_PROCS, + OMPI_SPC_BASE_ALLGATHER_LINEAR, + OMPI_SPC_BASE_BARRIER_DOUBLE_RING, + OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING, + 
OMPI_SPC_BASE_BARRIER_BRUCK, + OMPI_SPC_BASE_BARRIER_TWO_PROCS, + OMPI_SPC_BASE_BARRIER_LINEAR, + OMPI_SPC_BASE_BARRIER_TREE, + OMPI_SPC_P2P_MESSAGE_SIZE, + OMPI_SPC_EAGER_MESSAGES, + OMPI_SPC_NOT_EAGER_MESSAGES, + OMPI_SPC_QUEUE_ALLOCATION, OMPI_SPC_NUM_COUNTERS /* This serves as the number of counters. It must be last. */ } ompi_spc_counters_t; +extern uint32_t ompi_spc_attached_event[]; + /* There is currently no support for atomics on long long values so we will default to * size_t for now until support for such atomics is implemented. */ typedef opal_atomic_size_t ompi_spc_value_t; /* A structure for storing the event data */ -typedef struct ompi_spc_s{ +typedef struct ompi_spc_s { char *name; ompi_spc_value_t value; + int *bin_rules; /* The first element is the number of bins, the rest represent when each bin starts */ + ompi_spc_value_t *bins; } ompi_spc_t; +/* A structure for indexing into the event data */ +typedef struct ompi_spc_offset_s { + int num_bins; + int rules_offset; + int bins_offset; +} ompi_spc_offset_t; + /* Events data structure initialization function */ void ompi_spc_events_init(void); @@ -181,11 +253,15 @@ void ompi_spc_events_init(void); void ompi_spc_init(void); void ompi_spc_fini(void); void ompi_spc_record(unsigned int event_id, ompi_spc_value_t value); +void ompi_spc_bin_record(unsigned int event_id, ompi_spc_value_t value); +void ompi_spc_collective_bin_record(unsigned int event_id, ompi_spc_value_t bytes, ompi_spc_value_t procs); void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles); void ompi_spc_timer_stop(unsigned int event_id, opal_timer_t *cycles); void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enum, unsigned int mpi_enum); void ompi_spc_cycles_to_usecs(ompi_spc_value_t *cycles); void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum); +ompi_spc_value_t ompi_spc_get_value(unsigned int event_id); +bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos); /* 
Macros for using the SPC utility functions throughout the codebase. * If SPC_ENABLE is not 1, the macros become no-ops. @@ -199,22 +275,38 @@ void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_e ompi_spc_fini() #define SPC_RECORD(event_id, value) \ - ompi_spc_record(event_id, value) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_record(event_id, value) + +#define SPC_BIN_RECORD(event_id, value) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_bin_record(event_id, value) + +#define SPC_COLL_BIN_RECORD(event_id, bytes, procs) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_collective_bin_record(event_id, bytes, procs) #define SPC_TIMER_START(event_id, usec) \ - ompi_spc_timer_start(event_id, usec) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_timer_start(event_id, usec) #define SPC_TIMER_STOP(event_id, usec) \ - ompi_spc_timer_stop(event_id, usec) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_timer_stop(event_id, usec) #define SPC_USER_OR_MPI(tag, value, enum_if_user, enum_if_mpi) \ - ompi_spc_user_or_mpi(tag, value, enum_if_user, enum_if_mpi) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_user) && IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_mpi)) ) \ + ompi_spc_user_or_mpi(tag, value, enum_if_user, enum_if_mpi) #define SPC_CYCLES_TO_USECS(cycles) \ ompi_spc_cycles_to_usecs(cycles) #define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \ - ompi_spc_update_watermark(watermark_enum, value_enum) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, watermark_enum) && IS_SPC_BIT_SET(ompi_spc_attached_event, value_enum)) ) \ + ompi_spc_update_watermark(watermark_enum, value_enum) + +#define SPC_GET(event_id) \ + ompi_spc_get_value(event_id) #else /* SPCs are not enabled */ @@ -227,6 +319,12 @@ void 
ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_e #define SPC_RECORD(event_id, value) \ ((void)0) +#define SPC_BIN_RECORD(event_id, value) \ + ((void)0) + +#define SPC_COLL_BIN_RECORD(event_id, bytes, procs) \ + ((void)0) + #define SPC_TIMER_START(event_id, usec) \ ((void)0) @@ -242,6 +340,9 @@ void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_e #define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \ ((void)0) +#define SPC_GET(event_id) \ + ((void)0) + #endif #endif diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 194ac060da1..302a03e3be5 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -148,12 +148,49 @@ OMPI_DECLSPEC extern bool ompi_async_mpi_finalize; */ OMPI_DECLSPEC extern char * ompi_mpi_spc_attach_string; +/** + * A string to append to the SPC XML files for using the mmap interface. + * This is to make the filename easier to identify. + */ +OMPI_DECLSPEC extern char * ompi_mpi_spc_xml_string; + /** * A boolean value that determines whether or not to dump the SPC counter * values in MPI_Finalize. A value of true dumps the counters and false does not. */ OMPI_DECLSPEC extern bool ompi_mpi_spc_dump_enabled; +/** + * A boolean value that determines whether or not to dump the SPC counter + * values in an mmap'd file during execution. A value of true dumps the + * counters and false does not. 
+ */ +OMPI_DECLSPEC extern bool ompi_mpi_spc_mmap_enabled; + +/** + * An integer value that denotes the time period between snapshots with the + * SPC mmap interface. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_snapshot_period; + +/** + * An integer value that denotes the boundary at which a message is qualified + * as a small/large message for the point to point message counter. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_p2p_message_boundary; + +/** + * An integer value that denotes the boundary at which a message is qualified + * as a small/large message for collective bin counters. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_collective_message_boundary; + +/** + * An integer value that denotes the boundary at which a communicator is qualified + * as a small/large communicator for collective bin counters. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_collective_comm_boundary; + /** * Register MCA parameters used by the MPI layer. From 53a627eae3e5a08a8a896fc813ba355850be5229 Mon Sep 17 00:00:00 2001 From: David Eberius Date: Thu, 23 Jan 2020 18:36:28 -0500 Subject: [PATCH 2/3] This update introduces changes to how watermark counters are handled, adds new counters for internal queues such as counters for data usage and timers for out-of-sequence messages, and adds example code and documentation for SPCs. The watermark counter update reduces function call overhead by better merging the updates to the sentinel value and the watermark value. There are now three SPC example codes, one for accessing SPCs through MPI_T, one for accessing SPCs through the mmap interface, and one for parsing SPC snapshot datafiles and producing a heatmap plot of the counter value differences over time. This also adds documentation for SPCs in the form of a markdown file in the same directory as the SPC driver code. 
Signed-off-by: David Eberius --- examples/Makefile | 9 +- examples/spc_mmap_example.c | 222 +++++ .../{spc_example.c => spc_mpit_example.c} | 81 +- examples/spc_snapshot_parse.py | 170 ++++ ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 54 +- ompi/mca/pml/ob1/pml_ob1_recvfrag.h | 3 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 14 + ompi/runtime/ompi_mpi_params.c | 67 +- ompi/runtime/ompi_spc.c | 242 +++-- ompi/runtime/ompi_spc.h | 111 ++- ompi/runtime/ompi_spc_documentation.md | 829 ++++++++++++++++++ ompi/runtime/params.h | 54 +- 12 files changed, 1643 insertions(+), 213 deletions(-) create mode 100644 examples/spc_mmap_example.c rename examples/{spc_example.c => spc_mpit_example.c} (53%) create mode 100755 examples/spc_snapshot_parse.py create mode 100644 ompi/runtime/ompi_spc_documentation.md diff --git a/examples/Makefile b/examples/Makefile index 5d32bb00973..f6699ded452 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -65,14 +65,15 @@ EXAMPLES = \ oshmem_max_reduction \ oshmem_strided_puts \ oshmem_symmetric_data \ - spc_example + spc_mpit_example \ + spc_mmap_example # Default target. Always build the C MPI examples. Only build the # others if we have the appropriate Open MPI / OpenSHMEM language # bindings. -all: hello_c ring_c connectivity_c spc_example +all: hello_c ring_c connectivity_c spc_mpit_example spc_mmap_example @ if which ompi_info >/dev/null 2>&1 ; then \ $(MAKE) mpi; \ fi @@ -127,7 +128,9 @@ ring_c: ring_c.c $(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@ connectivity_c: connectivity_c.c $(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@ -spc_example: spc_example.c +spc_mpit_example: spc_mpit_example.c + $(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@ +spc_mmap_example: spc_mmap_example.c $(MPICC) $(CFLAGS) $(LDFLAGS) $? 
$(LDLIBS) -o $@ hello_mpifh: hello_mpifh.f diff --git a/examples/spc_mmap_example.c b/examples/spc_mmap_example.c new file mode 100644 index 00000000000..95b94a5b50b --- /dev/null +++ b/examples/spc_mmap_example.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through an mmap'd file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* This structure will help us store all of the offsets for each + * counter that we want to print out. + */ +typedef struct spc_s { + char name[128]; + int offset; + int rules_offset; + int bins_offset; +} spc_t; + +int main(int argc, char **argv) +{ + if(argc < 4) { + printf("Usage: ./spc_mmap_test [num_messages] [message_size] [XML string]\n"); + return -1; + } + + MPI_Init(NULL, NULL); + + int i, num_messages = atoi(argv[1]), message_size = atoi(argv[2]), rank, shm_fd; + char *buf = (char*)malloc(message_size * sizeof(char)); + + MPI_Request *requests = (MPI_Request*)malloc(num_messages * sizeof(MPI_Request)); + MPI_Status *statuses = (MPI_Status*)malloc(num_messages * sizeof(MPI_Status)); + MPI_Status status; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int retval, shm_file_size, num_counters, freq_mhz; + long long value; + char filename[128], shm_filename[128], line[128], *token; + + char hostname[128]; + gethostname(hostname, 128); + + char *nodename; + nodename = strtok(hostname, "."); + + char *xml_string = argv[3]; + snprintf(filename, 128, "/dev/shm/spc_data.%s.%s.%d.xml", nodename, xml_string, rank); + + FILE *fptr = NULL; + void *data_ptr; + spc_t *spc_data; + + if(NULL == (fptr = fopen(filename, "r"))) { + printf("Couldn't open xml file.\n"); + MPI_Finalize(); + return -1; + } else { + printf("[%d] Successfully opened the XML file!\n", rank); + } + + /* The following is to read the formatted XML file to get the basic + 
* information we need to read the shared memory file and properly + * format some counters. + */ + char tmp_filename[128]; + fgets(line, 128, fptr); + fgets(line, 128, fptr); + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", shm_filename); + + if(rank == 0) { + printf("shm_filename: %s\n", shm_filename); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &shm_file_size); + if(rank == 0) { + printf("shm_file_size: %d\n", shm_file_size); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &num_counters); + if(rank == 0) { + printf("num_counters: %d\n", num_counters); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &freq_mhz); + if(rank == 0) { + printf("freq_mhz: %d\n", freq_mhz); + } + + if(-1 == (shm_fd = open(shm_filename, O_RDONLY))){ + printf("\nCould not open file '%s'... Error String: %s\n", shm_filename, strerror(errno)); + return -1; + } else { + if(MAP_FAILED == (data_ptr = mmap(0, 8192, PROT_READ, MAP_SHARED, shm_fd, 0))) { + printf("Map failed :(\n"); + return -1; + } + printf("Successfully mmap'd file!\n"); + } + + spc_data = (spc_t*)malloc(num_counters * sizeof(spc_t)); + + for(i = 0; i < num_counters; i++) { + fgets(line, 128, fptr); /* Counter begin header */ + /* This should never happen... */ + if(strcmp(line,"\n") == 0) { + printf("Parsing ended prematurely. 
There weren't enough counters.\n"); + break; + } + + fgets(line, 128, fptr); /* Counter name header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", spc_data[i].name); /* Counter name */ + + fgets(line, 128, fptr); /* Counter value offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].offset); /* Counter offset */ + + fgets(line, 128, fptr); /* Counter rules offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].rules_offset); /* Counter rules offset */ + + fgets(line, 128, fptr); /* Counter bins offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].bins_offset); /* Counter bins offset */ + + fgets(line, 128, fptr); /* Counter end header */ + } + + fclose(fptr); + + /* The following communication pattern is intended to cause a certain + * number of unexpected messages. + */ + if(rank==0) { + for(i=num_messages; i > 0; i--) { + MPI_Isend(buf, message_size, MPI_BYTE, 1, i, MPI_COMM_WORLD, &requests[i-1]); + } + MPI_Send(buf, message_size, MPI_BYTE, 1, 0, MPI_COMM_WORLD); + MPI_Waitall(num_messages, requests, statuses); + + MPI_Barrier(MPI_COMM_WORLD); + + for(i = 0; i < num_counters; i++) { + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + } + MPI_Barrier(MPI_COMM_WORLD); + } else { + MPI_Recv(buf, message_size, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &status); + for(i=0; i < num_messages; i++) { + MPI_Recv(buf, message_size, MPI_BYTE, 0, i+1, MPI_COMM_WORLD, &statuses[i]); + } + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + for(i = 0; i < num_counters; i++) { + /* 
These counters are stored in cycles, so we convert them to microseconds. + */ + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) { + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + if(spc_data[i].rules_offset > 0) { + int j, *rules = (int*)(data_ptr+spc_data[i].rules_offset); + long long *bins = (long long*)(data_ptr+spc_data[i].bins_offset); + + for(j = 0; j < rules[0]; j++) { + if(j == rules[0]-1) { + printf("\t> %d\t", rules[j]); + } + else { + printf("\t<= %d\t", rules[j+1]); + } + printf("%lld\n", bins[j]); + } + } + } + } + } + + MPI_Finalize(); + + return 0; +} diff --git a/examples/spc_example.c b/examples/spc_mpit_example.c similarity index 53% rename from examples/spc_example.c rename to examples/spc_mpit_example.c index b7b09c35690..eb1d7d589f4 100644 --- a/examples/spc_example.c +++ b/examples/spc_mpit_example.c @@ -22,16 +22,33 @@ void message_exchange(int num_messages, int message_size) /* Use calloc to initialize data to 0's */ char *data = (char*)calloc(message_size, sizeof(char)); MPI_Status status; + MPI_Request req; MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* This is designed to have at least num_messages unexpected messages in order to + * hit the unexpected message queue counters. The broadcasts are here to showcase + * the collective bin counters and the P2P and Eager message counters. 
+ */ if(rank == 0) { - for(i = 0; i < num_messages; i++) - MPI_Send(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD); + for(i = 0; i < num_messages; i++) { + MPI_Isend(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD, &req); + } + MPI_Send(data, message_size, MPI_BYTE, 1, 321, MPI_COMM_WORLD); + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } } else if(rank == 1) { - for(i = 0; i < num_messages; i++) + MPI_Recv(data, message_size, MPI_BYTE, 0, 321, MPI_COMM_WORLD, &status); + for(i = 0; i < num_messages; i++) { MPI_Recv(data, message_size, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &status); + } + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } } + /* This should use the binomial algorithm so it has at least one counter value */ + MPI_Bcast(data, 1, MPI_BYTE, 0, MPI_COMM_WORLD); free(data); } @@ -46,16 +63,20 @@ int main(int argc, char **argv) } else { num_messages = atoi(argv[1]); message_size = atoi(argv[2]); + if(message_size <= 0) { + printf("Message size must be positive.\n"); + return -1; + } } - int i, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index, xml_index; + int i, j, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index, xml_index; MPI_Datatype datatype; MPI_T_enum enumtype; MPI_Comm comm; char name[256], description[256]; /* Counter names to be read by ranks 0 and 1 */ - char *counter_names[] = {"runtime_spc_OMPI_SPC_BYTES_SENT_USER", + char *counter_names[] = {"runtime_spc_OMPI_SPC_BASE_BCAST_BINOMIAL", "runtime_spc_OMPI_SPC_BYTES_RECEIVED_USER" }; char *xml_counter = "runtime_spc_OMPI_SPC_XML_FILE"; @@ -69,7 +90,26 @@ int main(int argc, char **argv) MPI_Abort(MPI_COMM_WORLD, -1); } - /* Determine the MPI_T pvar indices for the OMPI_BYTES_SENT/RECIEVED_USER SPCs */ + if(rank == 0) { + 
printf("##################################################################\n"); + printf("This test is designed to highlight several different SPC counters.\n"); + printf("The MPI workload of this test will use 1 MPI_Send and %d MPI_Isend\n", num_messages); + printf("operation(s) on the sender side (rank 0) and %d MPI_Recv operation(s)\n", num_messages+1); + printf("on the receiver side (rank 1) in such a way that at least %d message(s)\n", num_messages); + printf("are unexpected. This highlights the unexpected message queue SPCs.\n"); + printf("There will also be %d MPI_Bcast operation(s) with one of them being of\n", num_messages+1); + printf("size 1 byte, and %d being of size %d byte(s). The 1 byte MPI_Bcast is\n", num_messages, message_size); + printf("meant to ensure that there is at least one MPI_Bcast that uses the\n"); + printf("binomial algorithm so the MPI_T pvar isn't all 0's. The addition of\n"); + printf("the broadcasts also has the effect of showcasing the P2P message size,\n"); + printf("eager vs not eager message, and bytes sent by the user vs MPI SPCs.\n"); + printf("Be sure to set the mpi_spc_dump_enabled MCA parameter to true in order\n"); + printf("to see all of the tracked SPCs.\n"); + printf("##################################################################\n\n"); + } + MPI_Barrier(MPI_COMM_WORLD); + + /* Determine the MPI_T pvar indices for the requested SPCs */ index = xml_index = -1; MPI_T_pvar_get_num(&num); for(i = 0; i < num; i++) { @@ -80,14 +120,14 @@ int main(int argc, char **argv) if( MPI_SUCCESS != rc ) continue; + if(strcmp(name, xml_counter) == 0) { + xml_index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } if(strcmp(name, counter_names[rank]) == 0) { index = i; printf("[%d] %s -> %s\n", rank, name, description); } - if(strcmp(name, xml_counter) == 0) { - xml_index = i; - printf("[%d] %s -> %s (index -> %d)\n", rank, name, description, xml_index); - } } /* Make sure we found the counters */ @@ -97,9 +137,8 @@ int 
main(int argc, char **argv) } int ret, xml_count; - long long value; - char *xml_filename = (char*)malloc(64 * sizeof(char)); - sprintf(xml_filename, "this_is_a_test"); + long long *values = NULL; + char *xml_filename = (char*)malloc(128 * sizeof(char)); MPI_T_pvar_session session; MPI_T_pvar_handle handle; @@ -108,28 +147,36 @@ int main(int argc, char **argv) ret = MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count); ret = MPI_T_pvar_start(session, handle); + values = (long long*)malloc(count * sizeof(long long)); + MPI_T_pvar_session xml_session; MPI_T_pvar_handle xml_handle; if(xml_index >= 0) { ret = MPI_T_pvar_session_create(&xml_session); ret = MPI_T_pvar_handle_alloc(xml_session, xml_index, NULL, &xml_handle, &xml_count); - printf("xml_count: %d\n", xml_count); ret = MPI_T_pvar_start(xml_session, xml_handle); } + double timer = MPI_Wtime(); message_exchange(num_messages, message_size); + timer = MPI_Wtime() - timer; - ret = MPI_T_pvar_read(session, handle, &value); + printf("[%d] Elapsed time: %lf seconds\n", rank, timer); + + ret = MPI_T_pvar_read(session, handle, values); if(xml_index >= 0) { ret = MPI_T_pvar_read(xml_session, xml_handle, &xml_filename); } /* Print the counter values in order by rank */ for(i = 0; i < 2; i++) { + printf("\n"); if(i == rank) { - printf("[%d] Value Read: %lld\n", rank, value); if(xml_index >= 0) { - printf("[%d] Value Read: %s\n", rank, xml_filename); + printf("[%d] XML Counter Value Read: %s\n", rank, xml_filename); + } + for(j = 0; j < count; j++) { + printf("[%d] %s Counter Value Read: %lld\n", rank, counter_names[rank], values[j]); } fflush(stdout); } diff --git a/examples/spc_snapshot_parse.py b/examples/spc_snapshot_parse.py new file mode 100755 index 00000000000..40d6f26b8e3 --- /dev/null +++ b/examples/spc_snapshot_parse.py @@ -0,0 +1,170 @@ +#!/usr/bin/python + +import sys +import glob +import operator +import struct + +import numpy as np +import matplotlib +matplotlib.use('Agg') # For use with headless 
systems +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import matplotlib.ticker as ticker + +def combine(filename, data): + f = open(filename, 'rb') + for i in range(0,num_counters): + temp = struct.unpack('l', f.read(8))[0] + if 'TIME' in names[i]: + temp /= freq_mhz + data[i].append(temp) + +def fmt(x, pos): + return '{:,.0f}'.format(x) + +# Make sure the proper number of arguments have been supplied +if len(sys.argv) < 4: + print("Usage: ./spc_snapshot_parse.py [/path/to/data/files] [datafile_label] [list_of_spcs]") + exit() + +path = sys.argv[1] +label = sys.argv[2] + +xml_filename = '' +# Lists for storing the snapshot data files from each rank +copies = [] +ends = [] +# Populate the lists with the appropriate data files +for filename in glob.glob(path + "/spc_data*"): + if label in filename: + if xml_filename == '' and '.xml' in filename: + xml_filename = filename + if '.xml' not in filename: + temp = filename.split('/')[-1].split('.') + if len(temp) < 5: + temp[-1] = int(temp[-1]) + ends.append(temp) + else: + temp[-1] = int(temp[-1]) + temp[-2] = int(temp[-2]) + copies.append(temp) + +# Sort the lists +ends = sorted(ends, key = operator.itemgetter(-1)) +for i in range(0,len(ends)): + ends[i][-1] = str(ends[i][-1]) +copies = sorted(copies, key = operator.itemgetter(-2,-1)) +for i in range(0,len(copies)): + copies[i][-1] = str(copies[i][-1]) + copies[i][-2] = str(copies[i][-2]) + +sep = '.' 
+ +xml_file = open(xml_filename, 'r') +num_counters = 0 +freq_mhz = 0 +names = [] +base = [] +# Parse the XML file (same for all data files) +for line in xml_file: + if 'num_counters' in line: + num_counters = int(line.split('>')[1].split('<')[0]) + if 'freq_mhz' in line: + freq_mhz = int(line.split('>')[1].split('<')[0]) + if '' in line: + names.append(line.split('>')[1].split('<')[0]) + value = [names[-1]] + base.append(value) + +prev = copies[0] +i = 0 +ranks = [] +values = [] +times = [] +time = [] + +# Populate the data lists +for n in range(0,len(base)): + values.append([0, names[n]]) +for c in copies: + if c[-2] != prev[-2]: + filename = path + "/" + sep.join(ends[i]) + combine(filename, values) + + ranks.append(values) + times.append(time) + for j in range(0, len(names)): + temp = [ranks[0][j][0]] + + values = [] + time = [] + for n in range(0,len(base)): + values.append([i+1, names[n]]) + i += 1 + + filename = path + "/" + sep.join(c) + time.append(int(filename.split('.')[-1])) + combine(filename, values) + prev = c + +filename = path + "/" + sep.join(ends[i]) +combine(filename, values) +ranks.append(values) +times.append(time) + +spc_list = sys.argv[3].split(",") + +for i in range(0, len(names)): + fig = plt.figure(num=None, figsize=(7, 9), dpi=200, facecolor='w', edgecolor='k') + + plot = False + # Only plot the SPCs of interest + if names[i] in spc_list: + plot = True + + map_data = [] + avg_x = [] + + for j in range(0, len(ranks)): + if avg_x == None: + avg_x = np.zeros(len(times[j])-1) + empty = True + for k in range(2,len(ranks[j][i])): + if ranks[j][i][k] != 0: + empty = False + break + if not empty: + if plot: + xvals = [] + yvals = [] + for l in range(1, len(times[j])): + if ranks[j][i][l+2] - ranks[j][i][l+1] < 0: + break + xvals.append(times[j][l] - times[j][0]) + yvals.append(ranks[j][i][l+2] - ranks[j][i][l+1]) + + map_data.append(yvals) + for v in range(0,len(avg_x)): + avg_x[v] += xvals[v] + if plot: + for v in range(0,len(avg_x)): + 
avg_x[v] /= float(len(ranks)) + + ax = plt.gca() + im = ax.imshow(map_data, cmap='Reds', interpolation='nearest') + + cbar = ax.figure.colorbar(im, ax=ax, format=ticker.FuncFormatter(fmt)) + cbar.ax.set_ylabel("Counter Value", rotation=-90, va="bottom") + + plt.title(names[i] + ' Snapshot Difference') + + plt.xlabel('Time') + plt.ylabel('MPI Rank') + + ax.set_xticks(np.arange(len(avg_x))) + ax.set_yticks(np.arange(len(map_data))) + ax.set_xticklabels(avg_x) + + plt.show() + fig.savefig(names[i] + '.png') diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index af8eeab8fdb..256460dddd4 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -410,9 +410,15 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, MCA_PML_OB1_RECV_FRAG_ALLOC(frag); MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl); append_frag_to_ordered_list(&proc->frags_cant_match, frag, proc->expected_sequence); +#if SPC_ENABLE == 1 + size_t total_data = segments[0].seg_len; + for(int i = 1; i < num_segments; i++) { + total_data += segments[i].seg_len; + } + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_QUEUE_DATA, OMPI_SPC_OOS_QUEUE_DATA, total_data); +#endif SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1); - SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1); - SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, 1); OB1_MATCHING_UNLOCK(&comm->matching_lock); return; } @@ -508,13 +514,20 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, mca_pml_ob1_recv_frag_t* frag; OB1_MATCHING_LOCK(&comm->matching_lock); +#if SPC_ENABLE == 1 + opal_timer_t timer; + timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_OOS_MATCH_TIME, &timer); if((frag = check_cantmatch_for_match(proc))) { + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); /* mca_pml_ob1_recv_frag_match_proc() will release the lock. 
*/ mca_pml_ob1_recv_frag_match_proc(frag->btl, comm_ptr, proc, &frag->hdr.hdr_match, frag->segments, frag->num_segments, frag->hdr.hdr_match.hdr_common.hdr_type, frag); } else { + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); OB1_MATCHING_UNLOCK(&comm->matching_lock); } } @@ -878,9 +891,17 @@ match_one(mca_btl_base_module_t *btl, num_segments, frag); #endif SPC_TIMER_STOP(OMPI_SPC_MATCH_QUEUE_TIME, &queue_timer); + +#if SPC_ENABLE == 1 + size_t total_data = segments[0].seg_len; + for(int i = 1; i < num_segments; i++) { + total_data += segments[i].seg_len; + } + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA, OMPI_SPC_UNEXPECTED_QUEUE_DATA, total_data); +#endif + SPC_RECORD(OMPI_SPC_UNEXPECTED, 1); - SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, 1); - SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE, 1); PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); @@ -968,6 +989,12 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, */ OB1_MATCHING_LOCK(&comm->matching_lock); +#if SPC_ENABLE == 1 + opal_timer_t timer; + timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_OOS_MATCH_QUEUE_TIME, &timer); + frag_msg_seq = hdr->hdr_seq; next_msg_seq_expected = (uint16_t)proc->expected_sequence; @@ -978,15 +1005,23 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, MCA_PML_OB1_RECV_FRAG_ALLOC(frag); MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl); append_frag_to_ordered_list(&proc->frags_cant_match, frag, next_msg_seq_expected); + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_QUEUE_TIME, &timer); +#if SPC_ENABLE == 1 + size_t total_data = segments[0].seg_len; + for(int i = 1; i < num_segments; i++) { + total_data += segments[i].seg_len; + } + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_QUEUE_DATA, OMPI_SPC_OOS_QUEUE_DATA, total_data); +#endif 
SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1); - SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1); - SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, 1); OB1_MATCHING_UNLOCK(&comm->matching_lock); return OMPI_SUCCESS; } } + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_QUEUE_TIME, &timer); /* mca_pml_ob1_recv_frag_match_proc() will release the lock. */ return mca_pml_ob1_recv_frag_match_proc(btl, comm_ptr, proc, hdr, @@ -1070,7 +1105,13 @@ mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl, */ if(OPAL_UNLIKELY(NULL != proc->frags_cant_match)) { OB1_MATCHING_LOCK(&comm->matching_lock); +#if SPC_ENABLE == 1 + opal_timer_t timer; + timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_OOS_MATCH_TIME, &timer); if((frag = check_cantmatch_for_match(proc))) { + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); hdr = &frag->hdr.hdr_match; segments = frag->segments; num_segments = frag->num_segments; @@ -1078,6 +1119,7 @@ mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl, type = hdr->hdr_common.hdr_type; goto match_this_frag; } + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); OB1_MATCHING_UNLOCK(&comm->matching_lock); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h index 941471cdd39..6d82fc8488b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h @@ -80,13 +80,14 @@ do { \ macro_segments[0].seg_addr.pval = frag->addr; \ } else { \ buffers[0].len = _size; \ + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_QUEUE_ALLOCATION, \ + OMPI_SPC_QUEUE_ALLOCATION, _size); \ buffers[0].addr = (char*) \ mca_pml_ob1.allocator->alc_alloc( mca_pml_ob1.allocator, \ buffers[0].len, \ 0); \ _ptr = (unsigned char*)(buffers[0].addr); \ macro_segments[0].seg_addr.pval = buffers[0].addr; \ - SPC_RECORD(OMPI_SPC_QUEUE_ALLOCATION, buffers[0].len); \ } \ macro_segments[0].seg_len = _size; \ for( i = 0; i < cnt; i++ ) { \ diff --git 
a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 8588b9a5879..d2ec677dfee 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -1331,6 +1331,13 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) #else opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); +#endif +#if SPC_ENABLE == 1 + size_t total_data = frag->segments[0].seg_len; + for(int i = 1; i < frag->num_segments; i++) { + total_data += frag->segments[i].seg_len; + } + SPC_RECORD(OMPI_SPC_UNEXPECTED_QUEUE_DATA, -total_data); #endif SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, -1); OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); @@ -1367,6 +1374,13 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) #else opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); +#endif +#if SPC_ENABLE == 1 + size_t total_data = frag->segments[0].seg_len; + for(int i = 1; i < frag->num_segments; i++) { + total_data += frag->segments[i].seg_len; + } + SPC_RECORD(OMPI_SPC_UNEXPECTED_QUEUE_DATA, -total_data); #endif SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, -1); OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index befa28f2918..27e346cb673 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -75,15 +75,6 @@ bool ompi_async_mpi_finalize = false; uint32_t ompi_add_procs_cutoff = OMPI_ADD_PROCS_CUTOFF_DEFAULT; bool ompi_mpi_dynamics_enabled = true; -char *ompi_mpi_spc_attach_string = NULL; -char *ompi_mpi_spc_xml_string = NULL; -bool ompi_mpi_spc_dump_enabled = false; -bool ompi_mpi_spc_mmap_enabled = false; -int ompi_mpi_spc_snapshot_period = 0; -int ompi_mpi_spc_p2p_message_boundary = 12288; -int ompi_mpi_spc_collective_message_boundary = 12288; -int ompi_mpi_spc_collective_comm_boundary = 64; - static bool show_default_mca_params = false; static bool show_file_mca_params = false; static bool 
show_enviro_mca_params = false; @@ -330,61 +321,9 @@ int ompi_mpi_register_params(void) MCA_BASE_VAR_SYN_FLAG_DEPRECATED); } - ompi_mpi_spc_attach_string = NULL; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_attach", - "A comma delimeted string listing the software-based performance counters (SPCs) to enable.", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_attach_string); - - ompi_mpi_spc_xml_string = NULL; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_xml_string", - "A string to add to SPC XML files for easier identification. The format will be: spc_data.[nodename].[jobid or spc_xml_string].[world_rank].xml", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_xml_string); - - ompi_mpi_spc_dump_enabled = false; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_dump_enabled", - "A boolean value for whether (true) or not (false) to enable dumping SPC counters in MPI_Finalize.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_dump_enabled); - - ompi_mpi_spc_mmap_enabled = false; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_mmap_enabled", - "A boolean value for whether (true) or not (false) to enable dumping SPC counters to an mmap'd file.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_mmap_enabled); - - ompi_mpi_spc_p2p_message_boundary = 12288; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_p2p_message_boundary", - "An integer value for determining the boundary for whether a message is small/large for point to point message size bin counter (<= this value is small).", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_p2p_message_boundary); - - ompi_mpi_spc_collective_message_boundary = 12288; - (void) mca_base_var_register("ompi", "mpi", 
NULL, "spc_message_boundary", - "An integer value for determining the boundary for whether a message is small/large for collective bin counters (<= this value is small).", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_collective_message_boundary); - - ompi_mpi_spc_collective_comm_boundary = 64; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_comm_boundary", - "An integer value for determining the boundary for whether a communicator is small/large for collective bin counters (<= this value is small).", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_collective_comm_boundary); +#if SPC_ENABLE == 1 + (void) ompi_spc_register_params(); +#endif return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 360431fbc94..48b12efc5f8 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -17,24 +17,38 @@ #include "ompi_spc.h" -opal_timer_t sys_clock_freq_mhz = 0; - -static void ompi_spc_dump(void); - +static opal_timer_t sys_clock_freq_mhz = 0; static int mpi_t_offset = -1; static bool mpi_t_enabled = false; static bool spc_enabled = true; static bool need_free = false; - +static bool mmap_failed = false; static ompi_communicator_t *ompi_spc_comm = NULL; +/* MCA Parameter Variables */ +char *ompi_mpi_spc_attach_string = NULL; +char *ompi_mpi_spc_xml_string = NULL; +bool ompi_mpi_spc_dump_enabled = false; +bool ompi_mpi_spc_mmap_enabled = false; +int ompi_mpi_spc_snapshot_period = 0; +int ompi_mpi_spc_p2p_message_boundary = 12288; +int ompi_mpi_spc_collective_message_boundary = 12288; +int ompi_mpi_spc_collective_comm_boundary = 64; + typedef struct ompi_spc_event_t { const char* counter_name; const char* counter_description; } ompi_spc_event_t; +static void ompi_spc_dump(void); + #define SET_COUNTER_ARRAY(NAME, DESC) [NAME] = { .counter_name = #NAME, .counter_description = DESC } +/* STEP 2: Add the counter 
descriptions for new counters here along with their + * enumeration to be converted to a name value. NOTE: The names and + * descriptions MUST be in the same array location as where you added + * the counter name in the ompi_spc_counters_t enumeration! + */ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_SEND, "The number of times MPI_Send was called."), SET_COUNTER_ARRAY(OMPI_SPC_BSEND, "The number of times MPI_Bsend was called."), @@ -140,14 +154,14 @@ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED, "The number of messages that arrived as unexpected messages."), SET_COUNTER_ARRAY(OMPI_SPC_OUT_OF_SEQUENCE, "The number of messages that arrived out of the proper sequence."), SET_COUNTER_ARRAY(OMPI_SPC_OOS_QUEUE_HOPS, "The number of times we jumped to the next element in the out of sequence message queue's ordered list."), - SET_COUNTER_ARRAY(OMPI_SPC_MATCH_TIME, "The number of microseconds spent matching unexpected messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), - SET_COUNTER_ARRAY(OMPI_SPC_MATCH_QUEUE_TIME, "The number of microseconds spent inserting unexpected messages into the unexpected message queue. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_MATCH_TIME, "The amount of time (MPI_T reports microseconds) spent matching unexpected messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. 
On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_MATCH_QUEUE_TIME, "The amount of time (MPI_T reports microseconds, stored in cycles) spent inserting unexpected messages into the unexpected message queue. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_MATCH_TIME, "The amount of time (MPI_T reports microseconds, stored in cycles) spent matching out-of-sequence messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_MATCH_QUEUE_TIME, "The amount of time (MPI_T reports microseconds, stored in cycles) spent inserting out-of-sequence messages into the unexpected message queue. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED_IN_QUEUE, "The number of messages that are currently in the unexpected message queue(s) of an MPI process."), SET_COUNTER_ARRAY(OMPI_SPC_OOS_IN_QUEUE, "The number of messages that are currently in the out of sequence message queue(s) of an MPI process."), - SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, "The maximum number of messages that the unexpected message queue(s) within an MPI process " - "contained at once since the last reset of this counter. 
Note: This counter is reset each time it is read."), - SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_IN_QUEUE, "The maximum number of messages that the out of sequence message queue(s) within an MPI process " - "contained at once since the last reset of this counter. Note: This counter is reset each time it is read."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, "The maximum number of messages that the unexpected message queue(s) within an MPI process contained at once since the last reset of this counter. Note: This counter is reset each time it is read. Note: The OMPI_SPC_UNEXPECTED_IN_QUEUE counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_IN_QUEUE, "The maximum number of messages that the out of sequence message queue(s) within an MPI process contained at once since the last reset of this counter. Note: This counter is reset each time it is read. Note: The OMPI_SPC_OOS_IN_QUEUE counter must also be activated."), SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_LINEAR, "The number of times the base broadcast used the linear algorithm."), SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_CHAIN, "The number of times the base broadcast used the chain algorithm."), SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_PIPELINE, "The number of times the base broadcast used the pipeline algorithm."), @@ -191,13 +205,21 @@ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_TWO_PROCS, "The number of times the base barrier used the two process algorithm."), SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_LINEAR, "The number of times the base barrier used the linear algorithm."), SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_TREE, "The number of times the base barrier used the tree algorithm."), - SET_COUNTER_ARRAY(OMPI_SPC_P2P_MESSAGE_SIZE, "This is a bin counter with two subcounters. 
The first is messages that are less than or equal to 12288 bytes and the second is those that are larger than 12288 bytes."), + SET_COUNTER_ARRAY(OMPI_SPC_P2P_MESSAGE_SIZE, "This is a bin counter with two subcounters. The first is messages that are less than or equal to mpi_spc_p2p_message_boundary bytes and the second is those that are larger than mpi_spc_p2p_message_boundary bytes."), SET_COUNTER_ARRAY(OMPI_SPC_EAGER_MESSAGES, "The number of messages that fall within the eager size."), SET_COUNTER_ARRAY(OMPI_SPC_NOT_EAGER_MESSAGES, "The number of messages that do not fall within the eager size."), - SET_COUNTER_ARRAY(OMPI_SPC_QUEUE_ALLOCATION, "The amount of memory allocated after runtime currently in use for temporary message queues like the unexpected message queue and the out of sequence message queue.") + SET_COUNTER_ARRAY(OMPI_SPC_QUEUE_ALLOCATION, "The amount of memory allocated after runtime currently in use for temporary message queues like the unexpected message queue and the out of sequence message queue."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_QUEUE_ALLOCATION, "The maximum amount of memory allocated after runtime at one point for temporary message queues like the unexpected message queue and the out of sequence message queue. Note: The OMPI_SPC_QUEUE_ALLOCATION counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED_QUEUE_DATA, "The amount of memory currently in use for the unexpected message queue."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA, "The maximum amount of memory in use for the unexpected message queue. Note: The OMPI_SPC_UNEXPECTED_QUEUE_DATA counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_QUEUE_DATA, "The amount of memory currently in use for the out-of-sequence message queue."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_QUEUE_DATA, "The maximum amount of memory in use for the out-of-sequence message queue. 
Note: The OMPI_SPC_OOS_QUEUE_DATA counter must also be activated.") }; -/* A bitmap to denote whether an event is activated (1) or not (0) */ +/* A bitmap to denote whether an event is activated (1) or not (0) + * This is not static because it is needed in the recording macros + * for instrumentation. + */ OMPI_DECLSPEC uint32_t ompi_spc_attached_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; /* A bitmap to denote whether an event is timer-based (1) or not (0) */ static uint32_t ompi_spc_timer_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; @@ -206,11 +228,17 @@ static uint32_t ompi_spc_bin_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { /* A bitmap to denote whether an event is collective bin-based (1) or not (0) */ static uint32_t ompi_spc_collective_bin_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; -/* An array of event structures to store the event data (name and value) */ +/* A contiguous data structure for storing the counter values, rules, and bins */ void *ompi_spc_events = NULL; +/* An array of offset structures for indexing into the ompi_spc_events data */ static ompi_spc_offset_t ompi_spc_offsets[OMPI_SPC_NUM_COUNTERS] = {-1}; +/* A pointer into the ompi_spc_events data for the SPC values */ static ompi_spc_value_t *ompi_spc_values = NULL; +/* ############################################################## + * ################## SPC Bitmap Functions ###################### + * ############################################################## + */ static inline void SET_SPC_BIT(uint32_t* array, int32_t pos) { assert(pos < OMPI_SPC_NUM_COUNTERS); @@ -229,6 +257,70 @@ static inline void CLEAR_SPC_BIT(uint32_t* array, int32_t pos) array[pos / (8 * sizeof(uint32_t))] &= ~(1U << (pos % (8 * sizeof(uint32_t)))); } +/* Registers all of the SPC MCA parameter variables. If SPCs are enabled, this is called from + * ompi_mpi_params.c in its registration function.
+ */ +int ompi_spc_register_params() +{ + ompi_mpi_spc_attach_string = NULL; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_attach", + "A comma delimited string listing the software-based performance counters (SPCs) to enable.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_attach_string); + + ompi_mpi_spc_xml_string = NULL; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_xml_string", + "A string to add to SPC XML files for easier identification. The format will be: spc_data.[nodename].[jobid or spc_xml_string].[world_rank].xml", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_xml_string); + + ompi_mpi_spc_dump_enabled = false; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_dump_enabled", + "A boolean value for whether (true) or not (false) to enable dumping SPC counters in MPI_Finalize.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_dump_enabled); + + ompi_mpi_spc_mmap_enabled = false; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_mmap_enabled", + "A boolean value for whether (true) or not (false) to enable dumping SPC counters to an mmap'd file.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_mmap_enabled); + + ompi_mpi_spc_p2p_message_boundary = 12288; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_p2p_message_boundary", + "An integer value for determining the boundary for whether a message is small/large for point to point message size bin counter (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_p2p_message_boundary); + + ompi_mpi_spc_collective_message_boundary = 12288; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_collective_message_boundary", + "An integer value for determining the
boundary for whether a message is small/large for collective bin counters (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_collective_message_boundary); + + ompi_mpi_spc_collective_comm_boundary = 64; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_collective_comm_boundary", + "An integer value for determining the boundary for whether a communicator is small/large for collective bin counters (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_collective_comm_boundary); + + return OMPI_SUCCESS; +} + /* ############################################################## * ################# Begin MPI_T Functions ###################### * ############################################################## @@ -277,9 +369,8 @@ static int ompi_spc_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, v *count = strlen(filename); break; } - if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) ) { /* TODO: make sure this works */ - *count = *(int*)(ompi_spc_events+ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].rules_offset); - printf("Count: %d\n", *count); + if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) || IS_SPC_BIT_SET(ompi_spc_collective_bin_event, index) ) { + *count = *(int*)(ompi_spc_events+ompi_spc_offsets[index].rules_offset); } else { *count = 1; } @@ -341,15 +432,10 @@ static int ompi_spc_get_xml_filename(const struct mca_base_pvar_t *pvar, void *v return MPI_SUCCESS; } -/* ############################################################## - * ################# Begin SPC Functions ######################## - * ############################################################## - */ - /* This function returns the current count of an SPC counter that has been registered
The MPI_T index is not necessarily the same as the SPC index, * so we need to convert from MPI_T index to SPC index and then set the 'value' argument - * to the correct value for this pvar. + * to the correct value for this pvar. Watermark counters are also reset here. */ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) __opal_attribute_unused__; @@ -367,8 +453,11 @@ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, v /* If this is a bin-based counter, set 'value' to the array of bin values */ if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) || IS_SPC_BIT_SET(ompi_spc_collective_bin_event, index) ) { - long long **bin_value = (long long**)value; - *bin_value = (long long*)(ompi_spc_events+ompi_spc_offsets[index].bins_offset); + long long *bin_value = (long long*)value; + int count = ((int*)(ompi_spc_events+ompi_spc_offsets[index].rules_offset))[0]; + for(int i = 0; i < count; i++) { + bin_value[i] = ((long long*)(ompi_spc_events+ompi_spc_offsets[index].bins_offset))[i]; + } return MPI_SUCCESS; } @@ -380,28 +469,47 @@ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, v if( IS_SPC_BIT_SET(ompi_spc_timer_event, index) ) { *counter_value /= sys_clock_freq_mhz; } - /* If this is a high watermark counter, reset it after it has been read */ + /* STEP 4: If this is a high watermark counter, reset it after it has been read. + * Be sure to reset the counter to the current value of the counter it + * is tracking. 
+ */ if(index == OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE) { ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_UNEXPECTED_IN_QUEUE]; } if(index == OMPI_SPC_MAX_OOS_IN_QUEUE) { ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_OOS_IN_QUEUE]; } + if(index == OMPI_SPC_MAX_QUEUE_ALLOCATION) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_QUEUE_ALLOCATION]; + } + if(index == OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_UNEXPECTED_QUEUE_DATA]; + } + if(index == OMPI_SPC_MAX_OOS_QUEUE_DATA) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_OOS_QUEUE_DATA]; + } return MPI_SUCCESS; } +/* ############################################################## + * ################# Begin SPC Functions ######################## + * ############################################################## + */ + /* Initializes the events data structure and allocates memory for it if needed. */ void ompi_spc_events_init(void) { ompi_comm_dup(&ompi_mpi_comm_world.comm, &ompi_spc_comm); - int i, value_offset = 0, bin_offset = OMPI_SPC_NUM_COUNTERS*sizeof(ompi_spc_value_t), rank = ompi_comm_rank(ompi_spc_comm), shm_fd, rc, ret; + int i, value_offset = 0, bin_offset = OMPI_SPC_NUM_COUNTERS*sizeof(ompi_spc_value_t), rank = ompi_comm_rank(ompi_spc_comm), shm_fd, rc, ret, mod; char filename[SPC_MAX_FILENAME], *shm_dir; void *ptr; - if(0 > rc) { - opal_show_help("help-mpi-runtime.txt", "spc: filename creation failure", true); + /* Make sure the bin offset is cache aligned to avoid false sharing */ + mod = bin_offset % SPC_CACHE_LINE; + if(mod != 0) { + bin_offset += SPC_CACHE_LINE - mod; } FILE *fptr, *shm_fptr = NULL; @@ -433,6 +541,9 @@ void ompi_spc_events_init(void) rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); } + if (0 > rc) { + opal_show_help("help-mpi-runtime.txt", "spc: filename creation failure", true); + } fptr = fopen(filename, "w+"); /* 
Registers the name/path of the XML file as an MPI_T pvar */ @@ -442,7 +553,7 @@ void ompi_spc_events_init(void) MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, ompi_spc_get_xml_filename, NULL, ompi_spc_notify, NULL); if(ret < 0) { - printf("There was an error -> %s\n", opal_strerror(ret)); + opal_output(0, "There was an error registering an MPI_T pvar -> %s\n", opal_strerror(ret)); } @@ -450,6 +561,9 @@ void ompi_spc_events_init(void) fprintf(fptr, "\n"); } + /* STEP 3: Add specialized counter enumerations to the appropriate bitmap here. + * + */ /* ######################################################################## * ################## Add Timer Based Counter Enums Here ################## * ######################################################################## @@ -457,6 +571,8 @@ void ompi_spc_events_init(void) SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME); SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_QUEUE_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_OOS_MATCH_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_OOS_MATCH_QUEUE_TIME); /* ############################################################################### * ###################### Put Bin Counter Sizes Here ############################# @@ -464,10 +580,6 @@ void ompi_spc_events_init(void) */ int data_size = OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_value_t); - /* NOTE: If there are an odd number of bins, there could potentially be some false - * sharing with other counters, so make sure the data size is incremented by - * a multiple of cache line size (typically 8 bytes). 
- */ SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_P2P_MESSAGE_SIZE); ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].num_bins = 2; data_size += 2 * (sizeof(int) + sizeof(ompi_spc_value_t)); @@ -602,6 +714,7 @@ void ompi_spc_events_init(void) SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_SCATTER_LINEAR); SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_SCATTER_LINEAR); +/* Form for adding new collective bin counters */ #if 0 /* X Algorithms */ /* Collective bin counter for the X algorithm */ @@ -632,26 +745,29 @@ void ompi_spc_events_init(void) } /* ############################################################################### - * ############################################################################### + * ############################# MMAP Initialization ############################# * ############################################################################### */ - - int bytes_needed = PAGE_SIZE * ((data_size + PAGE_SIZE - 1) % PAGE_SIZE); + uint page_size = opal_getpagesize(); + int bytes_needed = page_size * ((data_size + page_size - 1) / page_size); if(ompi_mpi_spc_mmap_enabled) { rc = opal_shmem_segment_create(&shm_ds, sm_file, bytes_needed); if (OPAL_SUCCESS != rc) { opal_show_help("help-mpi-runtime.txt", "spc: shm segment creation failure", true); + goto map_failed; } int default_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; shm_fd = open(sm_file, O_RDWR | O_CREAT | O_NONBLOCK, default_permissions); if(0 > shm_fd) { opal_show_help("help-mpi-runtime.txt", "spc: shm file open failure", true, strerror(errno)); + goto map_failed; } my_segment = opal_shmem_segment_attach(&shm_ds); if(NULL == my_segment) { opal_show_help("help-mpi-runtime.txt", "spc: shm attach failure", true); + goto map_failed; } } @@ -664,9 +780,10 @@ void ompi_spc_events_init(void) if(MAP_FAILED == (ompi_spc_events = mmap(0, bytes_needed, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0))) { opal_show_help("help-mpi-runtime.txt", "spc: mmap failure", true,
strerror(errno)); map_failed: + mmap_failed = true; ompi_spc_events = NULL; /* mmap failed, so try malloc */ - if(NULL == (ompi_spc_events = malloc(data_size))) { + if(NULL == (ompi_spc_events = malloc(bytes_needed))) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); spc_enabled = false; @@ -678,7 +795,10 @@ void ompi_spc_events_init(void) ompi_spc_values = (ompi_spc_value_t*)ompi_spc_events; - if(ompi_mpi_spc_mmap_enabled) { + /* Write the XML file header with the basic spc information. + * NOTE: If we failed to mmap the data file, there is no need to write the XML file. + */ + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { fprintf(fptr, "\t%s\n", sm_file); fprintf(fptr, "\t%d\n", OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_t)); fprintf(fptr, "\t%d\n", OMPI_SPC_NUM_COUNTERS); @@ -692,7 +812,7 @@ void ompi_spc_events_init(void) ompi_spc_values[i] = 0; /* Add this counter to the XML document */ - if(ompi_mpi_spc_mmap_enabled) { + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { fprintf(fptr, "\t\n"); fprintf(fptr, "\t\t%s\n", ompi_spc_events_names[i].counter_name); fprintf(fptr, "\t\t%d\n", value_offset); @@ -705,22 +825,23 @@ void ompi_spc_events_init(void) ompi_spc_offsets[i].bins_offset = bin_offset; bin_offset += ompi_spc_offsets[i].num_bins*sizeof(ompi_spc_value_t); - int mod = bin_offset % CACHE_LINE; + /* Make sure the bin offset is cache aligned to avoid false sharing */ + mod = bin_offset % SPC_CACHE_LINE; if(mod != 0) { - bin_offset += CACHE_LINE - mod; + bin_offset += SPC_CACHE_LINE - mod; } } else { ompi_spc_offsets[i].rules_offset = -1; ompi_spc_offsets[i].bins_offset = -1; } - if(ompi_mpi_spc_mmap_enabled) { + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { fprintf(fptr, "\t\t%d\n", ompi_spc_offsets[i].rules_offset); fprintf(fptr, "\t\t%d\n", ompi_spc_offsets[i].bins_offset); fprintf(fptr, "\t\n"); } } - if(ompi_mpi_spc_mmap_enabled) { + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { fprintf(fptr, "\n"); 
fclose(fptr); } @@ -752,11 +873,11 @@ void ompi_spc_init(void) if(strcmp(arg_strings[0], "all") == 0) { all_on = 1; } + } else if(0 == num_args) { + goto no_counters; } for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - /* Reset all timer-based counters */ - CLEAR_SPC_BIT(ompi_spc_timer_event, i); matched = all_on; if( !matched ) { @@ -789,7 +910,6 @@ void ompi_spc_init(void) } } if( (ret < 0) || (all_on && (ret != (mpi_t_offset + found - 1))) ) { - printf("ret -> %d\n", ret); mpi_t_enabled = false; opal_show_help("help-mpi-runtime.txt", "spc: MPI_T disabled", true); break; @@ -800,7 +920,10 @@ void ompi_spc_init(void) * ###################### Initialize Bin Counters Here #################### * ######################################################################## */ - + /* NOTE: These should be initialized even if they aren't currently turned on. + * This is just in case they are turned on mid-run through MPI_T. + */ + /* STEP 3a: Regular bin counters initialized here. */ int *rules = NULL; ompi_spc_value_t *bins = NULL; @@ -813,7 +936,7 @@ void ompi_spc_init(void) rules[1] = ompi_mpi_spc_p2p_message_boundary; /* The number after which counters go in the second bin */ /* Initialize Collective Bin Counters Here */ - int num_bins = 4; /* TODO: make these user-defined */ + int num_bins = 4; /* This can be expanded to be more flexible */ for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event,i)) { @@ -831,6 +954,7 @@ void ompi_spc_init(void) } } +no_counters: opal_argv_free(arg_strings); } @@ -1035,7 +1159,7 @@ void ompi_spc_bin_record(unsigned int event_id, ompi_spc_value_t value) OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[num_bins-1]), 1); } -/* Records an update to a counter using an atomic add operation. */ +/* Records an update to a collective bin counter using an atomic add operation. 
*/ void ompi_spc_collective_bin_record(unsigned int event_id, ompi_spc_value_t bytes, ompi_spc_value_t procs) { int *rules; @@ -1098,17 +1222,25 @@ void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enu * WARNING: This assumes that this function was called while a lock has already been taken. * This function is NOT thread safe otherwise! */ -void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum) +void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum, ompi_spc_value_t value) { - if(ompi_spc_values[value_enum] > ompi_spc_values[watermark_enum]) { - ompi_spc_values[watermark_enum] = ompi_spc_values[value_enum]; + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[value_enum]), value); + if(IS_SPC_BIT_SET(ompi_spc_attached_event, watermark_enum)) { + if(ompi_spc_values[value_enum] > ompi_spc_values[watermark_enum]) { + ompi_spc_values[watermark_enum] = ompi_spc_values[value_enum]; + } } } +/* Gets the value of an SPC counter. + * + * WARNING: This function is not performed atomically and may not return the most up to date + * value in a threaded run. + */ ompi_spc_value_t ompi_spc_get_value(unsigned int event_id) { if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { - return ompi_spc_values[event_id]; /* Note: this is not thread-safe */ + return ompi_spc_values[event_id]; } return 0; } diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 878bed5245c..15e5f7a6e88 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -35,31 +35,97 @@ #include "opal/util/output.h" #include "opal/mca/shmem/base/base.h" #include "opal/mca/pmix/pmix.h" +#include "opal/util/sys_limits.h" -#define PAGE_SIZE 4096 /* The number of bytes in a page. TODO: This should be found programatically */ -#define CACHE_LINE 8 /* The number of bytes in a cache line. 
TODO: This should be found programatically */ +#define SPC_CACHE_LINE opal_cache_line_size /* The number of bytes in a cache line. */ #define SPC_MAX_FILENAME PATH_MAX /* The maximum length allowed for the spc file strings */ #define SPC_SHM_DIR "/dev/shm" /* The default directory for shared memory files */ #include MCA_timer_IMPLEMENTATION_HEADER +/* MCA Parameter Variables */ +/** + * A comma delimited list of SPC counters to turn on or 'attach'. To turn + * all counters on, the string can be simply "all". An empty string will + * keep all counters turned off. + */ +OMPI_DECLSPEC extern char * ompi_mpi_spc_attach_string; + +/** + * A string to append to the SPC XML files for using the mmap interface. + * This is to make the filename easier to identify. + */ +OMPI_DECLSPEC extern char * ompi_mpi_spc_xml_string; + +/** + * A boolean value that determines whether or not to dump the SPC counter + * values in MPI_Finalize. A value of true dumps the counters and false does not. + */ +OMPI_DECLSPEC extern bool ompi_mpi_spc_dump_enabled; + +/** + * A boolean value that determines whether or not to dump the SPC counter + * values in an mmap'd file during execution. A value of true dumps the + * counters and false does not. + */ +OMPI_DECLSPEC extern bool ompi_mpi_spc_mmap_enabled; + +/** + * An integer value that denotes the time period between snapshots with the + * SPC mmap interface. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_snapshot_period; + +/** + * An integer value that denotes the boundary at which a message is qualified + * as a small/large message for the point to point message counter. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_p2p_message_boundary; + +/** + * An integer value that denotes the boundary at which a message is qualified + * as a small/large message for collective bin counters. 
+ */ +OMPI_DECLSPEC extern int ompi_mpi_spc_collective_message_boundary; + +/** + * An integer value that denotes the boundary at which a communicator is qualified + * as a small/large communicator for collective bin counters. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_collective_comm_boundary; + /* INSTRUCTIONS FOR ADDING COUNTERS * 1.) Add a new counter name in the ompi_spc_counters_t enum before * OMPI_SPC_NUM_COUNTERS below. - * 2.) Add corresponding counter name and descriptions to the - * counter_names and counter_descriptions arrays in + * 2.) Add corresponding counter description(s) to the + * ompi_spc_events_names definition in * ompi_spc.c NOTE: The names and descriptions * MUST be in the same array location as where you added the * counter name in step 1. - * 3.) If this counter is based on a timer, add its enum name to - * the logic for timer-based counters in the ompi_spc_init - * function in ompi_spc.c - * 4.) Instrument the Open MPI code base where it makes sense for - * your counter to be modified using the SPC_RECORD macro. + * Search For: 'STEP 2' + * 3.) If this counter is a specialized counter like a timer, + * bin, or collective bin add its enum name to the logic for + * specialized counters in the ompi_spc_init function in ompi_spc.c + * Search For: 'STEP 3' + * NOTE: If this is a bin counter, and not a collective bin counter, + * you will need to initialize it. Search for: 'STEP 3a' + * 4.) If this counter is a watermark counter, additional logic is required + * for when this counter is read. The standard behavior of watermark + * counters is to keep track of updates to another counter and increase + * when that tracked counter exceeds the current value of the high + * watermark. These counters are reset to the current value of the + * tracked counter whenever they are read through MPI_T in the + * ompi_spc_get_count function in ompi_spc.c + * Search For: 'STEP 4' + * 5.) 
Instrument the Open MPI code where it makes sense for + * your counter to be modified using the appropriate SPC macro. + * This will typically be SPC_RECORD, but could be SPC_BIN_RECORD, + * SPC_COLL_BIN_RECORD, SPC_UPDATE_WATERMARK, or SPC_TIMER_START/STOP + * depending on the counter. * Note: If your counter is timer-based you should use the * SPC_TIMER_START and SPC_TIMER_STOP macros to record * the time in cycles to then be converted to microseconds later - * in the ompi_spc_get_count function when requested by MPI_T + * in the ompi_spc_get_count function when requested by MPI_T. */ /* This enumeration serves as event ids for the various events */ @@ -170,6 +236,8 @@ typedef enum ompi_spc_counters { OMPI_SPC_OOS_QUEUE_HOPS, OMPI_SPC_MATCH_TIME, OMPI_SPC_MATCH_QUEUE_TIME, + OMPI_SPC_OOS_MATCH_TIME, + OMPI_SPC_OOS_MATCH_QUEUE_TIME, OMPI_SPC_UNEXPECTED_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, @@ -221,6 +289,11 @@ typedef enum ompi_spc_counters { OMPI_SPC_EAGER_MESSAGES, OMPI_SPC_NOT_EAGER_MESSAGES, OMPI_SPC_QUEUE_ALLOCATION, + OMPI_SPC_MAX_QUEUE_ALLOCATION, + OMPI_SPC_UNEXPECTED_QUEUE_DATA, + OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA, + OMPI_SPC_OOS_QUEUE_DATA, + OMPI_SPC_MAX_OOS_QUEUE_DATA, OMPI_SPC_NUM_COUNTERS /* This serves as the number of counters. It must be last. 
*/ } ompi_spc_counters_t; @@ -246,6 +319,9 @@ typedef struct ompi_spc_offset_s { int bins_offset; } ompi_spc_offset_t; +/* MCA Parameters Initialization Function */ +int ompi_spc_register_params(void); + /* Events data structure initialization function */ void ompi_spc_events_init(void); @@ -259,7 +335,7 @@ void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles); void ompi_spc_timer_stop(unsigned int event_id, opal_timer_t *cycles); void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enum, unsigned int mpi_enum); void ompi_spc_cycles_to_usecs(ompi_spc_value_t *cycles); -void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum); +void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum, ompi_spc_value_t value); ompi_spc_value_t ompi_spc_get_value(unsigned int event_id); bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos); @@ -295,15 +371,18 @@ bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos); ompi_spc_timer_stop(event_id, usec) #define SPC_USER_OR_MPI(tag, value, enum_if_user, enum_if_mpi) \ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_user) && IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_mpi)) ) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_user) || IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_mpi)) ) \ ompi_spc_user_or_mpi(tag, value, enum_if_user, enum_if_mpi) #define SPC_CYCLES_TO_USECS(cycles) \ ompi_spc_cycles_to_usecs(cycles) -#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, watermark_enum) && IS_SPC_BIT_SET(ompi_spc_attached_event, value_enum)) ) \ - ompi_spc_update_watermark(watermark_enum, value_enum) +/* WARNING: This macro assumes that it was called while a lock has already been taken. + * This function is NOT thread safe otherwise! 
+ */ +#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum, value) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, value_enum)) ) \ + ompi_spc_update_watermark(watermark_enum, value_enum, value) #define SPC_GET(event_id) \ ompi_spc_get_value(event_id) @@ -337,7 +416,7 @@ bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos); #define SPC_CYCLES_TO_USECS(cycles) \ ((void)0) -#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \ +#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum, value) \ ((void)0) #define SPC_GET(event_id) \ diff --git a/ompi/runtime/ompi_spc_documentation.md b/ompi/runtime/ompi_spc_documentation.md new file mode 100644 index 00000000000..6c488e275e9 --- /dev/null +++ b/ompi/runtime/ompi_spc_documentation.md @@ -0,0 +1,829 @@ +## What Are Software Performance Counters? +Software Performance Counters (SPCs) are modelled after [PAPI's](http://icl.utk.edu/papi/) hardware performance counters, but are designed to keep track of information that originates from software rather than hardware. The basic idea is to add instrumentation to a software package to keep track of performance metrics of interest and provide access to them in a portable way. The implementation of SPCs within Open MPI exposes information about the internal operation of Open MPI that would otherwise not be available to users and tool developers. These counters have been integrated with the MPI Tool Information Interface (MPI_T) as performance variables, or pvars, to make this information more readily available to tool developers and users. These counters can also be accessed through an mmap-based interface at runtime. + +## Building OMPI With SPC Support +By default, Open MPI does not build with SPCs enabled, so all of the instrumentation code becomes no-ops. Building Open MPI with SPCs is as simple as adding `--enable-spc` to your configure line. 
Once SPCs have been built in, there are several MCA parameters that are used to manage SPCs at runtime: + +- mpi_spc_attach: A string used to turn specific counters on. There are two reserved strings, 'all' and 'none', for turning on all and none of the counters respectively. Otherwise, this should be a comma-separated list of counters to turn on using the counter names. The default value is 'none'. +- mpi_spc_dump_enabled: A boolean parameter denoted by a 'true' or 'false' string that determines whether or not to print the counter values to stdout during MPI_Finalize. The default value is 'false' +- mpi_spc_mmap_enabled: A boolean parameter denoted by a 'true' or 'false' string that determines whether or not to use the mmap interface for storing the SPC data. The default value is 'false' +- mpi_spc_xml_string: A string that is appended to the XML file created by the mmap interface for easy identification. For example, if this variable is set to the value of 'test', the resultant XML filename would be spc_data.[nodename].test.[world rank].xml. By default this string is empty, which results in the 'test' value being replaced by the Open MPI jobid. +- orte_spc_snapshot_period: A floating point value denoting the amount of time in seconds after which to create a snapshot of the SPC values using the snapshot feature within the mmap interface. Any negative value will mean no snapshots will be taken. The default value is -1. +- mpi_spc_p2p_message_boundary: An integer value denoting the point after which messages are determined to be 'large' messages within the OMPI_SPC_P2P_MESSAGE_SIZE bin counter. The default value is 12288. +- mpi_spc_collective_message_boundary: An integer value denoting the point after which messages are determined to be 'large' messages for collective bin counters. The default value is 12288. +- mpi_spc_collective_comm_boundary: An integer value denoting the point after which a communicator is determined to be 'large' for collective bin counters.
The default value is 64. + +Setting these MCA parameters can be done via the command line like so: + +`mpirun -np X --mca mpi_spc_attach OMPI_SPC_SEND,OMPI_SPC_RECV --mca mpi_spc_dump_enabled true ./your_app` + +These MCA parameters can also be set inside an mca-params.conf file like so: + +```bash +mpi_spc_attach = all +mpi_spc_dump_enabled = true +mpi_spc_mmap_enabled = true +mpi_spc_xml_string = myXMLstring +``` + +## Using SPCs Through MPI_T +All of the SPC counters are registered with MPI_T as pvars, which means that they can be easily accessed by tools. It is worth noting that if any of the SPCs fail to register with MPI_T, all counters are turned off with respect to MPI_T. This is a design decision to create a fast translation between MPI_T indices and SPC indices. + +The following is a simple example C program that will show how these counters could be used through MPI_T in practice. Essentially, this example sends some number of messages of some size both specified by the user from process rank 0 to rank 1. Rank 0 uses an MPI_T pvar to report the number of times the binomial algorithm was used for a broadcast under different conditions Small/Large communicator/message size, and rank 1 registers an MPI_T pvar to determine the number of bytes received through point to point communications. This example can also be found in the examples directory of the Open MPI repository. + +```c +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through MPI_T. + */ + +#include +#include +#include +#include + +#include "mpi.h" + +/* Sends 'num_messages' messages of 'message_size' bytes from rank 0 to rank 1. + * All messages are send synchronously and with the same tag in MPI_COMM_WORLD. 
+ */ +void message_exchange(int num_messages, int message_size) +{ + int i, rank; + /* Use calloc to initialize data to 0's */ + char *data = (char*)calloc(message_size, sizeof(char)); + MPI_Status status; + MPI_Request req; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* This is designed to have at least num_messages unexpected messages in order to + * hit the unexpected message queue counters. The broadcasts are here to showcase + * the collective bin counters and the P2P and Eager message counters. + */ + if(rank == 0) { + for(i = 0; i < num_messages; i++) { + MPI_Isend(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD, &req); + } + MPI_Send(data, message_size, MPI_BYTE, 1, 321, MPI_COMM_WORLD); + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } + } else if(rank == 1) { + MPI_Recv(data, message_size, MPI_BYTE, 0, 321, MPI_COMM_WORLD, &status); + for(i = 0; i < num_messages; i++) { + MPI_Recv(data, message_size, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &status); + } + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } + } + /* This should use the binomial algorithm so it has at least one counter value */ + MPI_Bcast(data, 1, MPI_BYTE, 0, MPI_COMM_WORLD); + + free(data); +} + +int main(int argc, char **argv) +{ + int num_messages, message_size; + + if(argc < 3) { + printf("Usage: mpirun -np 2 --mca mpi_spc_attach all --mca mpi_spc_dump_enabled true ./spc_example [num_messages] [message_size]\n"); + return -1; + } else { + num_messages = atoi(argv[1]); + message_size = atoi(argv[2]); + if(message_size <= 0) { + printf("Message size must be positive.\n"); + return -1; + } + } + + int i, j, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index, xml_index; + MPI_Datatype datatype; + MPI_T_enum enumtype; + MPI_Comm comm; + char name[256], description[256]; + + /* Counter names to be read by ranks 0 and 1 */ 
+ char *counter_names[] = {"runtime_spc_OMPI_SPC_BASE_BCAST_BINOMIAL", + "runtime_spc_OMPI_SPC_BYTES_RECEIVED_USER" }; + char *xml_counter = "runtime_spc_OMPI_SPC_XML_FILE"; + + MPI_Init(NULL, NULL); + MPI_T_init_thread(MPI_THREAD_SINGLE, &provided); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if(size != 2) { + fprintf(stderr, "ERROR: This test should be run with two MPI processes.\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + if(rank == 0) { + printf("##################################################################\n"); + printf("This test is designed to highlight several different SPC counters.\n"); + printf("The MPI workload of this test will use 1 MPI_Send and %d MPI_Isend\n", num_messages); + printf("operation(s) on the sender side (rank 0) and %d MPI_Recv operation(s)\n", num_messages+1); + printf("on the receiver side (rank 1) in such a way that at least %d message(s)\n", num_messages); + printf("are unexpected. This highlights the unexpected message queue SPCs.\n"); + printf("There will also be %d MPI_Bcast operation(s) with one of them being of\n", num_messages+1); + printf("size 1 byte, and %d being of size %d byte(s). The 1 byte MPI_Bcast is\n", num_messages, message_size); + printf("meant to ensure that there is at least one MPI_Bcast that uses the\n"); + printf("binomial algorithm so the MPI_T pvar isn't all 0's. 
The addition of\n"); + printf("the broadcasts also has the effect of showcasing the P2P message size,\n"); + printf("eager vs not eager message, and bytes sent by the user vs MPI SPCs.\n"); + printf("Be sure to set the mpi_spc_dump_enabled MCA parameter to true in order\n"); + printf("to see all of the tracked SPCs.\n"); + printf("##################################################################\n\n"); + } + MPI_Barrier(MPI_COMM_WORLD); + + /* Determine the MPI_T pvar indices for the requested SPCs */ + index = xml_index = -1; + MPI_T_pvar_get_num(&num); + for(i = 0; i < num; i++) { + name_len = desc_len = 256; + PMPI_T_pvar_get_info(i, name, &name_len, &verbosity, + &var_class, &datatype, &enumtype, description, &desc_len, &bind, + &readonly, &continuous, &atomic); + + if(strcmp(name, xml_counter) == 0) { + xml_index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } + if(strcmp(name, counter_names[rank]) == 0) { + index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } + } + + /* Make sure we found the counters */ + if(index == -1 || xml_index == -1) { + fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + int ret, xml_count; + long long *values = NULL; + char *xml_filename = (char*)malloc(128 * sizeof(char)); + + MPI_T_pvar_session session; + MPI_T_pvar_handle handle; + /* Create the MPI_T sessions/handles for the counters and start the counters */ + ret = MPI_T_pvar_session_create(&session); + ret = MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count); + ret = MPI_T_pvar_start(session, handle); + + values = (long long*)malloc(count * sizeof(long long)); + + MPI_T_pvar_session xml_session; + MPI_T_pvar_handle xml_handle; + if(xml_index >= 0) { + ret = MPI_T_pvar_session_create(&xml_session); + ret = MPI_T_pvar_handle_alloc(xml_session, xml_index, NULL, &xml_handle, &xml_count); + ret = MPI_T_pvar_start(xml_session, xml_handle); + } + + double timer 
= MPI_Wtime(); + message_exchange(num_messages, message_size); + timer = MPI_Wtime() - timer; + + printf("[%d] Elapsed time: %lf seconds\n", rank, timer); + + ret = MPI_T_pvar_read(session, handle, values); + if(xml_index >= 0) { + ret = MPI_T_pvar_read(xml_session, xml_handle, &xml_filename); + } + + /* Print the counter values in order by rank */ + for(i = 0; i < 2; i++) { + printf("\n"); + if(i == rank) { + if(xml_index >= 0) { + printf("[%d] XML Counter Value Read: %s\n", rank, xml_filename); + } + for(j = 0; j < count; j++) { + printf("[%d] %s Counter Value Read: %lld\n", rank, counter_names[rank], values[j]); + } + fflush(stdout); + } + MPI_Barrier(MPI_COMM_WORLD); + } + /* Stop the MPI_T session, free the handle, and then free the session */ + ret = MPI_T_pvar_stop(session, handle); + ret = MPI_T_pvar_handle_free(session, &handle); + ret = MPI_T_pvar_session_free(&session); + + MPI_T_finalize(); + MPI_Finalize(); + + return 0; +} +``` + +### Using SPCs Through the mmap Interface + +There is another interface other than MPI_T for accessing SPCs. In order to use this interface, you need to set the MCA parameter `mpi_spc_mmap_enabled` to true. This method uses mmap to create a memory region in which you can directly access SPCs after using mmap on a certain file. All the information necessary to attach to the correct file is dumped in an XML file for each rank in a shared system location (the default is /dev/shm) with the name `spc_data.[nodename].[SPC XML String or Open MPI jobid].[world rank].xml`. This XML file will have the format shown in the example below with the filename and file size for performing the mmap, and the number of counters and the clock frequency in MHz for ease of parsing (the clock frequency is needed for converting timer counters to microseconds from cycles). Each counter tag will have a name, and three offsets. The three offsets denote how many bytes that information is offset into the counter data.
All counters have a value field, but only bin counters have an offset for the rules and bins (If the counter doesn't have bins, the values in these fields will be -1). + +#### XML File Example +```xml + + + /dev/shm/spc_data.c00.-860356607.0 + 5248 + 164 + 2127 + + OMPI_SPC_SEND + 0 + -1 + -1 + + + OMPI_SPC_BSEND + 8 + -1 + -1 + + + ... + + ... + +``` + +#### mmap Interface Usage in C +```c +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through an mmap'd file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* This structure will help us store all of the offsets for each + * counter that we want to print out. + */ +typedef struct spc_s { + char name[128]; + int offset; + int rules_offset; + int bins_offset; +} spc_t; + +int main(int argc, char **argv) +{ + if(argc < 4) { + printf("Usage: ./spc_mmap_test [num_messages] [message_size] [XML string]\n"); + return -1; + } + + MPI_Init(NULL, NULL); + + int i, num_messages = atoi(argv[1]), message_size = atoi(argv[2]), rank, shm_fd; + char *buf = (char*)malloc(message_size * sizeof(char)); + + MPI_Request *requests = (MPI_Request*)malloc(num_messages * sizeof(MPI_Request)); + MPI_Status *statuses = (MPI_Status*)malloc(num_messages * sizeof(MPI_Status)); + MPI_Status status; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int retval, shm_file_size, num_counters, freq_mhz; + long long value; + char filename[128], shm_filename[128], line[128], *token; + + char hostname[128]; + gethostname(hostname, 128); + + char *nodename; + nodename = strtok(hostname, "."); + + char *xml_string = argv[3]; + snprintf(filename, 128, "/dev/shm/spc_data.%s.%s.%d.xml", nodename, xml_string, rank); + + FILE *fptr = NULL; + void *data_ptr; + spc_t *spc_data; + + if(NULL == (fptr = fopen(filename, "r"))) { + printf("Couldn't open xml file.\n"); + MPI_Finalize(); 
+ return -1; + } else { + printf("[%d] Successfully opened the XML file!\n", rank); + } + + /* The following is to read the formatted XML file to get the basic + * information we need to read the shared memory file and properly + * format some counters. + */ + char tmp_filename[128]; + fgets(line, 128, fptr); + fgets(line, 128, fptr); + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", shm_filename); + + if(rank == 0) { + printf("shm_filename: %s\n", shm_filename); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &shm_file_size); + if(rank == 0) { + printf("shm_file_size: %d\n", shm_file_size); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &num_counters); + if(rank == 0) { + printf("num_counters: %d\n", num_counters); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &freq_mhz); + if(rank == 0) { + printf("freq_mhz: %d\n", freq_mhz); + } + + if(-1 == (shm_fd = open(shm_filename, O_RDONLY))){ + printf("\nCould not open file '%s'... Error String: %s\n", shm_filename, strerror(errno)); + return -1; + } else { + if(MAP_FAILED == (data_ptr = mmap(0, 8192, PROT_READ, MAP_SHARED, shm_fd, 0))) { + printf("Map failed :(\n"); + return -1; + } + printf("Successfully mmap'd file!\n"); + } + + spc_data = (spc_t*)malloc(num_counters * sizeof(spc_t)); + + for(i = 0; i < num_counters; i++) { + fgets(line, 128, fptr); /* Counter begin header */ + /* This should never happen... */ + if(strcmp(line,"\n") == 0) { + printf("Parsing ended prematurely. 
There weren't enough counters.\n"); + break; + } + + fgets(line, 128, fptr); /* Counter name header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", spc_data[i].name); /* Counter name */ + + fgets(line, 128, fptr); /* Counter value offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].offset); /* Counter offset */ + + fgets(line, 128, fptr); /* Counter rules offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].rules_offset); /* Counter rules offset */ + + fgets(line, 128, fptr); /* Counter bins offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].bins_offset); /* Counter bins offset */ + + fgets(line, 128, fptr); /* Counter end header */ + } + + fclose(fptr); + + /* The following communication pattern is intended to cause a certain + * number of unexpected messages. + */ + if(rank==0) { + for(i=num_messages; i > 0; i--) { + MPI_Isend(buf, message_size, MPI_BYTE, 1, i, MPI_COMM_WORLD, &requests[i-1]); + } + MPI_Send(buf, message_size, MPI_BYTE, 1, 0, MPI_COMM_WORLD); + MPI_Waitall(num_messages, requests, statuses); + + MPI_Barrier(MPI_COMM_WORLD); + + for(i = 0; i < num_counters; i++) { + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + } + MPI_Barrier(MPI_COMM_WORLD); + } else { + MPI_Recv(buf, message_size, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &status); + for(i=0; i < num_messages; i++) { + MPI_Recv(buf, message_size, MPI_BYTE, 0, i+1, MPI_COMM_WORLD, &statuses[i]); + } + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + for(i = 0; i < num_counters; i++) { + /* 
These counters are stored in cycles, so we convert them to microseconds. + */ + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) { + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + if(spc_data[i].rules_offset > 0) { + int j, *rules = (int*)(data_ptr+spc_data[i].rules_offset); + long long *bins = (long long*)(data_ptr+spc_data[i].bins_offset); + + for(j = 0; j < rules[0]; j++) { + if(j == rules[0]-1) { + printf("\t> %d\t", rules[j]); + } + else { + printf("\t<= %d\t", rules[j+1]); + } + printf("%lld\n", bins[j]); + } + } + } + } + } + + MPI_Finalize(); + + return 0; +} +``` + +#### Snapshot Feature in the mmap Interface +The mmap interface also allows for collecting snapshots of the SPC counter values periodically throughout an execution through a built-in snapshot feature. These snapshots use the 'orte_spc_snapshot_period' MCA parameter to determine the length of time after which to create a copy of the SPC data file from the mmap interface. The snapshot data file copies simply append a timestamp to the end of the mmap data file to keep track of when that snapshot was taken. These snapshot files can be used to show how counter values change over time. + +The following is an example python script that takes the values from these snapshot files and creates heatmaps of the change in the counter values over time. This example script takes three command line arguments: a directory where all of the snapshot, XML, and original data files are stored; the XML string or Open MPI jobid to identify these data and XML files; a comma-separated list of SPCs to be used in creating the heatmaps. 
+ +```python +import sys +import glob +import operator +import struct + +import numpy as np +import matplotlib +matplotlib.use('Agg') # For use with headless systems +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import matplotlib.ticker as ticker + +def combine(filename, data): + f = open(filename, 'rb') + for i in range(0,num_counters): + temp = struct.unpack('l', f.read(8))[0] + if 'TIME' in names[i]: + temp /= freq_mhz + data[i].append(temp) + +def fmt(x, pos): + return '{:,.0f}'.format(x) + +# Make sure the proper number of arguments have been supplied +if len(sys.argv) < 4: + print("Usage: ./spc_snapshot_parse.py [/path/to/data/files] [datafile_label] [list_of_spcs]") + exit() + +path = sys.argv[1] +label = sys.argv[2] + +xml_filename = '' +# Lists for storing the snapshot data files from each rank +copies = [] +ends = [] +# Populate the lists with the appropriate data files +for filename in glob.glob(path + "/spc_data*"): + if label in filename: + if xml_filename == '' and '.xml' in filename: + xml_filename = filename + if '.xml' not in filename: + temp = filename.split('/')[-1].split('.') + if len(temp) < 5: + temp[-1] = int(temp[-1]) + ends.append(temp) + else: + temp[-1] = int(temp[-1]) + temp[-2] = int(temp[-2]) + copies.append(temp) + +# Sort the lists +ends = sorted(ends, key = operator.itemgetter(-1)) +for i in range(0,len(ends)): + ends[i][-1] = str(ends[i][-1]) +copies = sorted(copies, key = operator.itemgetter(-2,-1)) +for i in range(0,len(copies)): + copies[i][-1] = str(copies[i][-1]) + copies[i][-2] = str(copies[i][-2]) + +sep = '.' 
 + +xml_file = open(xml_filename, 'r') +num_counters = 0 +freq_mhz = 0 +names = [] +base = [] +# Parse the XML file (same for all data files) +for line in xml_file: + if 'num_counters' in line: + num_counters = int(line.split('>')[1].split('<')[0]) + if 'freq_mhz' in line: + freq_mhz = int(line.split('>')[1].split('<')[0]) + if '<name>' in line: + names.append(line.split('>')[1].split('<')[0]) + value = [names[-1]] + base.append(value) + +prev = copies[0] +i = 0 +ranks = [] +values = [] +times = [] +time = [] + +# Populate the data lists +for n in range(0,len(base)): + values.append([0, names[n]]) +for c in copies: + if c[-2] != prev[-2]: + filename = path + "/" + sep.join(ends[i]) + combine(filename, values) + + ranks.append(values) + times.append(time) + for j in range(0, len(names)): + temp = [ranks[0][j][0]] + + values = [] + time = [] + for n in range(0,len(base)): + values.append([i+1, names[n]]) + i += 1 + + filename = path + "/" + sep.join(c) + time.append(int(filename.split('.')[-1])) + combine(filename, values) + prev = c + +filename = path + "/" + sep.join(ends[i]) +combine(filename, values) +ranks.append(values) +times.append(time) + +spc_list = sys.argv[3].split(",") + +for i in range(0, len(names)): + fig = plt.figure(num=None, figsize=(7, 9), dpi=200, facecolor='w', edgecolor='k') + + plot = False + # Only plot the SPCs of interest + if names[i] in spc_list: + plot = True + + map_data = [] + avg_x = None + + for j in range(0, len(ranks)): + if avg_x is None: + avg_x = np.zeros(len(times[j])-1) + empty = True + for k in range(2,len(ranks[j][i])): + if ranks[j][i][k] != 0: + empty = False + break + if not empty: + if plot: + xvals = [] + yvals = [] + for l in range(1, len(times[j])): + if ranks[j][i][l+2] - ranks[j][i][l+1] < 0: + break + xvals.append(times[j][l] - times[j][0]) + yvals.append(ranks[j][i][l+2] - ranks[j][i][l+1]) + + map_data.append(yvals) + for v in range(0,len(avg_x)): + avg_x[v] += xvals[v] + if plot: + for v in range(0,len(avg_x)): + 
avg_x[v] /= float(len(ranks)) + + ax = plt.gca() + im = ax.imshow(map_data, cmap='Reds', interpolation='nearest') + + cbar = ax.figure.colorbar(im, ax=ax, format=ticker.FuncFormatter(fmt)) + cbar.ax.set_ylabel("Counter Value", rotation=-90, va="bottom") + + plt.title(names[i] + ' Snapshot Difference') + + plt.xlabel('Time') + plt.ylabel('MPI Rank') + + ax.set_xticks(np.arange(len(avg_x))) + ax.set_yticks(np.arange(len(map_data))) + ax.set_xticklabels(avg_x) + + plt.show() + fig.savefig(names[i] + '.png') +``` + +### List of Counters + +|Name |Description| +| --- | --------- | +|OMPI_SPC_SEND|The number of times MPI_Send was called.| +|OMPI_SPC_BSEND|The number of times MPI_Bsend was called.| +|OMPI_SPC_RSEND|The number of times MPI_Rsend was called.| +|OMPI_SPC_SSEND|The number of times MPI_Ssend was called.| +|OMPI_SPC_RECV|The number of times MPI_Recv was called.| +|OMPI_SPC_MRECV|The number of times MPI_Mrecv was called.| +|OMPI_SPC_ISEND|The number of times MPI_Isend was called.| +|OMPI_SPC_IBSEND|The number of times MPI_Ibsend was called.| +|OMPI_SPC_IRSEND|The number of times MPI_Irsend was called.| +|OMPI_SPC_ISSEND|The number of times MPI_Issend was called.| +|OMPI_SPC_IRECV|The number of times MPI_Irecv was called.| +|OMPI_SPC_SENDRECV|The number of times MPI_Sendrecv was called.| +|OMPI_SPC_SENDRECV_REPLACE|The number of times MPI_Sendrecv_replace was called.| +|OMPI_SPC_PUT|The number of times MPI_Put was called.| +|OMPI_SPC_RPUT|The number of times MPI_Rput was called.| +|OMPI_SPC_GET|The number of times MPI_Get was called.| +|OMPI_SPC_RGET|The number of times MPI_Rget was called.| +|OMPI_SPC_PROBE|The number of times MPI_Probe was called.| +|OMPI_SPC_IPROBE|The number of times MPI_Iprobe was called.| +|OMPI_SPC_BCAST|The number of times MPI_Bcast was called.| +|OMPI_SPC_IBCAST|The number of times MPI_Ibcast was called.| +|OMPI_SPC_BCAST_INIT|The number of times MPIX_Bcast_init was called.| +|OMPI_SPC_REDUCE|The number of times MPI_Reduce was called.| 
+|OMPI_SPC_REDUCE_SCATTER|The number of times MPI_Reduce_scatter was called.| +|OMPI_SPC_REDUCE_SCATTER_BLOCK|The number of times MPI_Reduce_scatter_block was called.| +|OMPI_SPC_IREDUCE|The number of times MPI_Ireduce was called.| +|OMPI_SPC_IREDUCE_SCATTER|The number of times MPI_Ireduce_scatter was called.| +|OMPI_SPC_IREDUCE_SCATTER_BLOCK|The number of times MPI_Ireduce_scatter_block was called.| +|OMPI_SPC_REDUCE_INIT|The number of times MPIX_Reduce_init was called.| +|OMPI_SPC_REDUCE_SCATTER_INIT|The number of times MPIX_Reduce_scatter_init was called.| +|OMPI_SPC_REDUCE_SCATTER_BLOCK_INIT|The number of times MPIX_Reduce_scatter_block_init was called.| +|OMPI_SPC_ALLREDUCE|The number of times MPI_Allreduce was called.| +|OMPI_SPC_IALLREDUCE|The number of times MPI_Iallreduce was called.| +|OMPI_SPC_ALLREDUCE_INIT|The number of times MPIX_Allreduce_init was called.| +|OMPI_SPC_SCAN|The number of times MPI_Scan was called.| +|OMPI_SPC_EXSCAN|The number of times MPI_Exscan was called.| +|OMPI_SPC_ISCAN|The number of times MPI_Iscan was called.| +|OMPI_SPC_IEXSCAN|The number of times MPI_Iexscan was called.| +|OMPI_SPC_SCAN_INIT|The number of times MPIX_Scan_init was called.| +|OMPI_SPC_EXSCAN_INIT|The number of times MPIX_Exscan_init was called.| +|OMPI_SPC_SCATTER|The number of times MPI_Scatter was called.| +|OMPI_SPC_SCATTERV|The number of times MPI_Scatterv was called.| +|OMPI_SPC_ISCATTER|The number of times MPI_Iscatter was called.| +|OMPI_SPC_ISCATTERV|The number of times MPI_Iscatterv was called.| +|OMPI_SPC_SCATTER_INIT|The number of times MPIX_Scatter_init was called.| +|OMPI_SPC_SCATTERV_INIT|The number of times MPIX_Scatterv_init was called.| +|OMPI_SPC_GATHER|The number of times MPI_Gather was called.| +|OMPI_SPC_GATHERV|The number of times MPI_Gatherv was called.| +|OMPI_SPC_IGATHER|The number of times MPI_Igather was called.| +|OMPI_SPC_IGATHERV|The number of times MPI_Igatherv was called.| +|OMPI_SPC_GATHER_INIT|The number of times 
MPIX_Gather_init was called.| +|OMPI_SPC_GATHERV_INIT|The number of times MPIX_Gatherv_init was called.| +|OMPI_SPC_ALLTOALL|The number of times MPI_Alltoall was called.| +|OMPI_SPC_ALLTOALLV|The number of times MPI_Alltoallv was called.| +|OMPI_SPC_ALLTOALLW|The number of times MPI_Alltoallw was called.| +|OMPI_SPC_IALLTOALL|The number of times MPI_Ialltoall was called.| +|OMPI_SPC_IALLTOALLV|The number of times MPI_Ialltoallv was called.| +|OMPI_SPC_IALLTOALLW|The number of times MPI_Ialltoallw was called.| +|OMPI_SPC_ALLTOALL_INIT|The number of times MPIX_Alltoall_init was called.| +|OMPI_SPC_ALLTOALLV_INIT|The number of times MPIX_Alltoallv_init was called.| +|OMPI_SPC_ALLTOALLW_INIT|The number of times MPIX_Alltoallw_init was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALL|The number of times MPI_Neighbor_alltoall was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLV|The number of times MPI_Neighbor_alltoallv was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLW|The number of times MPI_Neighbor_alltoallw was called.| +|OMPI_SPC_INEIGHBOR_ALLTOALL|The number of times MPI_Ineighbor_alltoall was called.| +|OMPI_SPC_INEIGHBOR_ALLTOALLV|The number of times MPI_Ineighbor_alltoallv was called.| +|OMPI_SPC_INEIGHBOR_ALLTOALLW|The number of times MPI_Ineighbor_alltoallw was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALL_INIT|The number of times MPIX_Neighbor_alltoall_init was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLV_INIT|The number of times MPIX_Neighbor_alltoallv_init was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLW_INIT|The number of times MPIX_Neighbor_alltoallw_init was called.| +|OMPI_SPC_ALLGATHER|The number of times MPI_Allgather was called.| +|OMPI_SPC_ALLGATHERV|The number of times MPI_Allgatherv was called.| +|OMPI_SPC_IALLGATHER|The number of times MPI_Iallgather was called.| +|OMPI_SPC_IALLGATHERV|The number of times MPI_Iallgatherv was called.| +|OMPI_SPC_ALLGATHER_INIT|The number of times MPIX_Allgather_init was called.| +|OMPI_SPC_ALLGATHERV_INIT|The number of times MPIX_Allgatherv_init was called.| 
+|OMPI_SPC_NEIGHBOR_ALLGATHER|The number of times MPI_Neighbor_allgather was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHERV|The number of times MPI_Neighbor_allgatherv was called.| +|OMPI_SPC_INEIGHBOR_ALLGATHER|The number of times MPI_Ineighbor_allgather was called.| +|OMPI_SPC_INEIGHBOR_ALLGATHERV|The number of times MPI_Ineighbor_allgatherv was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHER_INIT|The number of times MPIX_Neighbor_allgather_init was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHERV_INIT|The number of times MPIX_Neighbor_allgatherv_init was called.| +|OMPI_SPC_TEST|The number of times MPI_Test was called.| +|OMPI_SPC_TESTALL|The number of times MPI_Testall was called.| +|OMPI_SPC_TESTANY|The number of times MPI_Testany was called.| +|OMPI_SPC_TESTSOME|The number of times MPI_Testsome was called.| +|OMPI_SPC_WAIT|The number of times MPI_Wait was called.| +|OMPI_SPC_WAITALL|The number of times MPI_Waitall was called.| +|OMPI_SPC_WAITANY|The number of times MPI_Waitany was called.| +|OMPI_SPC_WAITSOME|The number of times MPI_Waitsome was called.| +|OMPI_SPC_BARRIER|The number of times MPI_Barrier was called.| +|OMPI_SPC_IBARRIER|The number of times MPI_Ibarrier was called.| +|OMPI_SPC_BARRIER_INIT|The number of times MPIX_Barrier_init was called.| +|OMPI_SPC_WTIME|The number of times MPI_Wtime was called.| +|OMPI_SPC_CANCEL|The number of times MPI_Cancel was called.| +|OMPI_SPC_BYTES_RECEIVED_USER|The number of bytes received by the user through point-to-point communications. Note: Includes bytes transferred using internal RMA operations.| +|OMPI_SPC_BYTES_RECEIVED_MPI|The number of bytes received by MPI through collective, control, or other internal communications.| +|OMPI_SPC_BYTES_SENT_USER|The number of bytes sent by the user through point-to-point communications. 
Note: Includes bytes transferred using internal RMA operations.| +|OMPI_SPC_BYTES_SENT_MPI|The number of bytes sent by MPI through collective, control, or other internal communications.| +|OMPI_SPC_BYTES_PUT|The number of bytes sent/received using RMA Put operations both through user-level Put functions and internal Put functions.| +|OMPI_SPC_BYTES_GET|The number of bytes sent/received using RMA Get operations both through user-level Get functions and internal Get functions.| +|OMPI_SPC_UNEXPECTED|The number of messages that arrived as unexpected messages.| +|OMPI_SPC_OUT_OF_SEQUENCE|The number of messages that arrived out of the proper sequence.| +|OMPI_SPC_OOS_QUEUE_HOPS|The number of times we jumped to the next element in the out of sequence message queue's ordered list.| +|OMPI_SPC_MATCH_TIME|The amount of time spent matching incoming messages. Note: this counter is stored in cycles and converted to microseconds when read.| +|OMPI_SPC_MATCH_QUEUE_TIME|The amount of time spent searching message queues for a match. Note: this counter is stored in cycles and converted to microseconds when read.| +|OMPI_SPC_OOS_MATCH_TIME|The amount of time spent matching out of sequence messages. Note: this counter is stored in cycles and converted to microseconds when read.| +|OMPI_SPC_OOS_MATCH_QUEUE_TIME|The amount of time spent searching the out of sequence message queue for a match. Note: this counter is stored in cycles and converted to microseconds when read.| +|OMPI_SPC_UNEXPECTED_IN_QUEUE|The number of messages that are currently in the unexpected message queue.| +|OMPI_SPC_OOS_IN_QUEUE|The number of messages that are currently in the out of sequence message queue.| +|OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE|The maximum number of messages that the unexpected message queue has held at one time.| +|OMPI_SPC_MAX_OOS_IN_QUEUE|The maximum number of messages that the out of sequence message queue has held at one time.| +|OMPI_SPC_BASE_BCAST_LINEAR|The number of times the base broadcast used the linear algorithm.| +|OMPI_SPC_BASE_BCAST_CHAIN|The number of times the base broadcast used the chain algorithm.| +|OMPI_SPC_BASE_BCAST_PIPELINE|The number of times the base broadcast used the pipeline algorithm.| +|OMPI_SPC_BASE_BCAST_SPLIT_BINTREE|The number of times the base broadcast used the split binary tree algorithm.| +|OMPI_SPC_BASE_BCAST_BINTREE|The number of times the base broadcast used the binary tree algorithm.| +|OMPI_SPC_BASE_BCAST_BINOMIAL|The number of times the base broadcast used the binomial algorithm.| +|OMPI_SPC_BASE_REDUCE_CHAIN|The number of times 
the base reduce used the chain algorithm.| +|OMPI_SPC_BASE_REDUCE_PIPELINE|The number of times the base reduce used the pipeline algorithm.| +|OMPI_SPC_BASE_REDUCE_BINARY|The number of times the base reduce used the binary tree algorithm.| +|OMPI_SPC_BASE_REDUCE_BINOMIAL|The number of times the base reduce used the binomial tree algorithm.| +|OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE|The number of times the base reduce used the in order binary tree algorithm.| +|OMPI_SPC_BASE_REDUCE_LINEAR|The number of times the base reduce used the basic linear algorithm.| +|OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING|The number of times the base reduce scatter used the nonoverlapping algorithm.| +|OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING|The number of times the base reduce scatter used the recursive halving algorithm.| +|OMPI_SPC_BASE_REDUCE_SCATTER_RING|The number of times the base reduce scatter used the ring algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING|The number of times the base allreduce used the nonoverlapping algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING|The number of times the base allreduce used the recursive doubling algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_RING|The number of times the base allreduce used the ring algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED|The number of times the base allreduce used the segmented ring algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_LINEAR|The number of times the base allreduce used the linear algorithm.| +|OMPI_SPC_BASE_SCATTER_BINOMIAL|The number of times the base scatter used the binomial tree algorithm.| +|OMPI_SPC_BASE_SCATTER_LINEAR|The number of times the base scatter used the linear algorithm.| +|OMPI_SPC_BASE_GATHER_BINOMIAL|The number of times the base gather used the binomial tree algorithm.| +|OMPI_SPC_BASE_GATHER_LINEAR_SYNC|The number of times the base gather used the synchronous linear algorithm.| +|OMPI_SPC_BASE_GATHER_LINEAR|The number of times the base gather used the linear algorithm.| 
+|OMPI_SPC_BASE_ALLTOALL_INPLACE|The number of times the base alltoall used the in-place algorithm.| +|OMPI_SPC_BASE_ALLTOALL_PAIRWISE|The number of times the base alltoall used the pairwise algorithm.| +|OMPI_SPC_BASE_ALLTOALL_BRUCK|The number of times the base alltoall used the bruck algorithm.| +|OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC|The number of times the base alltoall used the synchronous linear algorithm.| +|OMPI_SPC_BASE_ALLTOALL_TWO_PROCS|The number of times the base alltoall used the two process algorithm.| +|OMPI_SPC_BASE_ALLTOALL_LINEAR|The number of times the base alltoall used the linear algorithm.| +|OMPI_SPC_BASE_ALLGATHER_BRUCK|The number of times the base allgather used the bruck algorithm.| +|OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING|The number of times the base allgather used the recursive doubling algorithm.| +|OMPI_SPC_BASE_ALLGATHER_RING|The number of times the base allgather used the ring algorithm.| +|OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE|The number of times the base allgather used the neighbor exchange algorithm.| +|OMPI_SPC_BASE_ALLGATHER_TWO_PROCS|The number of times the base allgather used the two process algorithm.| +|OMPI_SPC_BASE_ALLGATHER_LINEAR|The number of times the base allgather used the linear algorithm.| +|OMPI_SPC_BASE_BARRIER_DOUBLE_RING|The number of times the base barrier used the double ring algorithm.| +|OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING|The number of times the base barrier used the recursive doubling algorithm.| +|OMPI_SPC_BASE_BARRIER_BRUCK|The number of times the base barrier used the bruck algorithm.| +|OMPI_SPC_BASE_BARRIER_TWO_PROCS|The number of times the base barrier used the two process algorithm.| +|OMPI_SPC_BASE_BARRIER_LINEAR|The number of times the base barrier used the linear algorithm.| +|OMPI_SPC_BASE_BARRIER_TREE|The number of times the base barrier used the tree algorithm.| +|OMPI_SPC_P2P_MESSAGE_SIZE|This is a bin counter with two subcounters. 
The first is messages that are less than or equal to mpi_spc_p2p_message_boundary bytes and the second is those that are larger than mpi_spc_p2p_message_boundary bytes.| +|OMPI_SPC_EAGER_MESSAGES|The number of messages that fall within the eager size.| +|OMPI_SPC_NOT_EAGER_MESSAGES|The number of messages that do not fall within the eager size.| +|OMPI_SPC_QUEUE_ALLOCATION|The amount of memory allocated after runtime currently in use for temporary message queues like the unexpected message queue and the out of sequence message queue.| +|OMPI_SPC_MAX_QUEUE_ALLOCATION|The maximum amount of memory allocated after runtime at one point for temporary message queues like the unexpected message queue and the out of sequence message queue. Note: The OMPI_SPC_QUEUE_ALLOCATION counter must also be activated.| +|OMPI_SPC_UNEXPECTED_QUEUE_DATA|The amount of memory currently in use for the unexpected message queue.| +|OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA|The maximum amount of memory in use for the unexpected message queue. Note: The OMPI_SPC_UNEXPECTED_QUEUE_DATA counter must also be activated.| +|OMPI_SPC_OOS_QUEUE_DATA|The amount of memory currently in use for the out-of-sequence message queue.| +|OMPI_SPC_MAX_OOS_QUEUE_DATA|The maximum amount of memory in use for the out-of-sequence message queue. Note: The OMPI_SPC_OOS_QUEUE_DATA counter must also be activated.| diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 302a03e3be5..9c06074fff4 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -26,6 +26,9 @@ #define OMPI_RUNTIME_PARAMS_H #include "ompi_config.h" +#if SPC_ENABLE == 1 +#include "ompi_spc.h" +#endif BEGIN_C_DECLS @@ -141,57 +144,6 @@ OMPI_DECLSPEC extern bool ompi_async_mpi_init; /* EXPERIMENTAL: do not perform an RTE barrier at the beginning of MPI_Finalize */ OMPI_DECLSPEC extern bool ompi_async_mpi_finalize; -/** - * A comma delimited list of SPC counters to turn on or 'attach'. To turn - * all counters on, the string can be simply "all". 
An empty string will - * keep all counters turned off. - */ -OMPI_DECLSPEC extern char * ompi_mpi_spc_attach_string; - -/** - * A string to append to the SPC XML files for using the mmap interface. - * This is to make the filename easier to identify. - */ -OMPI_DECLSPEC extern char * ompi_mpi_spc_xml_string; - -/** - * A boolean value that determines whether or not to dump the SPC counter - * values in MPI_Finalize. A value of true dumps the counters and false does not. - */ -OMPI_DECLSPEC extern bool ompi_mpi_spc_dump_enabled; - -/** - * A boolean value that determines whether or not to dump the SPC counter - * values in an mmap'd file during execution. A value of true dumps the - * counters and false does not. - */ -OMPI_DECLSPEC extern bool ompi_mpi_spc_mmap_enabled; - -/** - * An integer value that denotes the time period between snapshots with the - * SPC mmap interface. - */ -OMPI_DECLSPEC extern int ompi_mpi_spc_snapshot_period; - -/** - * An integer value that denotes the boundary at which a message is qualified - * as a small/large message for the point to point message counter. - */ -OMPI_DECLSPEC extern int ompi_mpi_spc_p2p_message_boundary; - -/** - * An integer value that denotes the boundary at which a message is qualified - * as a small/large message for collective bin counters. - */ -OMPI_DECLSPEC extern int ompi_mpi_spc_collective_message_boundary; - -/** - * An integer value that denotes the boundary at which a communicator is qualified - * as a small/large communicator for collective bin counters. - */ -OMPI_DECLSPEC extern int ompi_mpi_spc_collective_comm_boundary; - - /** * Register MCA parameters used by the MPI layer. * From 6fc0a12812841ddf7c3fc71d4bc2d49b1221f725 Mon Sep 17 00:00:00 2001 From: David Eberius Date: Mon, 11 May 2020 18:58:47 -0400 Subject: [PATCH 3/3] This update reflects the recent removal of orte in favor of prrte. 
This requires an update to the SPC driver code and documentation and necessitates moving the SPC snapshot feature to another code region which will be in a separate pull request. Signed-off-by: David Eberius --- examples/Makefile | 2 +- examples/spc_snapshot_parse.py | 7 +++++++ ompi/mca/coll/base/coll_base_allgather.c | 2 +- ompi/mca/coll/base/coll_base_allreduce.c | 2 +- ompi/mca/coll/base/coll_base_alltoall.c | 2 +- ompi/mca/coll/base/coll_base_barrier.c | 2 +- ompi/mca/coll/base/coll_base_bcast.c | 2 +- ompi/mca/coll/base/coll_base_gather.c | 2 +- ompi/mca/coll/base/coll_base_reduce.c | 2 +- ompi/mca/coll/base/coll_base_reduce_scatter.c | 2 +- ompi/mca/coll/base/coll_base_scatter.c | 2 +- ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 2 +- ompi/mca/pml/ob1/pml_ob1_recvfrag.h | 2 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 2 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 2 +- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 2 +- ompi/runtime/Makefile.am | 1 - ompi/runtime/help-mpi-runtime.txt | 2 +- ompi/runtime/ompi_mpi_params.c | 2 +- ompi/runtime/ompi_spc.c | 3 +-- ompi/runtime/ompi_spc.h | 3 +-- ompi/runtime/ompi_spc_documentation.md | 16 +++++++++------- ompi/runtime/params.h | 2 +- 23 files changed, 36 insertions(+), 30 deletions(-) diff --git a/examples/Makefile b/examples/Makefile index f6699ded452..3ce2a6b07a8 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2018 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. 
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/examples/spc_snapshot_parse.py b/examples/spc_snapshot_parse.py index 40d6f26b8e3..20e4f98bea5 100755 --- a/examples/spc_snapshot_parse.py +++ b/examples/spc_snapshot_parse.py @@ -1,5 +1,12 @@ #!/usr/bin/python +# Copyright (c) 2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# Simple example usage of SPC snapshots that creates a heatmap of +# SPC usage over time. + import sys import glob import operator diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index c03a64c039a..aca3292d68f 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 1504a2ae9ed..e34d74f750b 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index 9d18e1ecc16..74ea98cff05 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_barrier.c b/ompi/mca/coll/base/coll_base_barrier.c index 7707f126056..bc192085b4f 100644 --- a/ompi/mca/coll/base/coll_base_barrier.c +++ b/ompi/mca/coll/base/coll_base_barrier.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 8b55c7e9cd2..fd7d8f7d564 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index a7f09f0c2f6..90780792a02 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index dcdfaaa9221..e7b7590401d 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 92c6d997a70..9a938306ecd 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. 
All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index 90df9ab3116..ebe75c8a39e 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 256460dddd4..59f2d5e3940 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h index 6d82fc8488b..53b813cbaeb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index d2ec677dfee..0e5288a4135 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index b0ac50d461d..30cc48aaadb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 8a39f212990..e5fa56951d1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/runtime/Makefile.am b/ompi/runtime/Makefile.am index 646056c4819..e452544844c 100644 --- a/ompi/runtime/Makefile.am +++ b/ompi/runtime/Makefile.am @@ -51,4 +51,3 @@ libompi_mpir_la_SOURCES = \ libompi_mpir_la_CFLAGS = $(MPIR_UNWIND_CFLAGS) lib@OMPI_LIBMPI_NAME@_la_LIBADD += libompi_mpir.la - diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index f2f62796004..621fe6995ec 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -3,7 +3,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 27e346cb673..500152508fd 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 48b12efc5f8..53edc1decde 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 The University of Tennessee and The University + * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -1109,7 +1109,6 @@ void ompi_spc_fini(void) int fd, rc; char sm_file[SPC_MAX_FILENAME]; char *shm_dir = SPC_SHM_DIR; - orte_proc_t *pptr; int rank = ompi_comm_rank(ompi_spc_comm); diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 15e5f7a6e88..e3a8f88979d 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 The University of Tennessee and The University + * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2018 Research Organization for Information Science @@ -34,7 +34,6 @@ #include "opal/util/show_help.h" #include "opal/util/output.h" #include "opal/mca/shmem/base/base.h" -#include "opal/mca/pmix/pmix.h" #include "opal/util/sys_limits.h" #define SPC_CACHE_LINE opal_cache_line_size /* The number of bytes in a cache line. */ diff --git a/ompi/runtime/ompi_spc_documentation.md b/ompi/runtime/ompi_spc_documentation.md index 6c488e275e9..7088c364de5 100644 --- a/ompi/runtime/ompi_spc_documentation.md +++ b/ompi/runtime/ompi_spc_documentation.md @@ -8,7 +8,6 @@ By default, Open MPI does not build with SPCs enabled, so all of the instrumenta - mpi_spc_dump_enabled: A boolean parameter denoted by a 'true' or 'false' string that determines whether or not to print the counter values to stdout during MPI_Finalize. 
The default value is 'false' - mpi_spc_mmap_enabled: A boolean parameter denoted by a 'true' or 'false' string that determines whether or not to use the mmap interface for storing the SPC data. The default value is 'false' - mpi_spc_xml_string: A string that is appended to the XML file created by the mmap interface for easy identification. For example, if this variable is set to the value of 'test', the resultant XML filename would be spc_data.[nodename].test.[world rank].xml. By default this string is empty, which results in the 'test' value being replaced by the Open MPI jobid. -- orte_spc_snapshot_period: A floating point value denoting the amount of time in seconds after which to create a snapshot of the SPC values using the snapshot feature within the mmap interface. Any negative value will mean no snapshots will be taken. The default value is -1. - mpi_spc_p2p_message_boundary: An integer value denoting the point after which messages are determined to be 'large' messages within the OMPI_SPC_P2P_MESSAGE_SIZE bin counter. The default value is 12288. - mpi_spc_collective_message_boundary: An integer value denoting the point after which messages are determined to be 'large' messages for collective bin counters. The default value is 12288. - mpi_spc_collective_comm_boundary: An integer value denoting the point after which a communicator is determined to be 'large' for collective bin counters. The default value is 64. @@ -33,7 +32,7 @@ The following is a simple example C program that will show how these counters co ```c /* - * Copyright (c) 2020 The University of Tennessee and The University + * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved.
* @@ -89,7 +88,7 @@ void message_exchange(int num_messages, int message_size) int main(int argc, char **argv) { - int num_messages, message_size; + int num_messages, message_size, rc; if(argc < 3) { printf("Usage: mpirun -np 2 --mca mpi_spc_attach all --mca mpi_spc_dump_enabled true ./spc_example [num_messages] [message_size]\n"); @@ -148,9 +147,11 @@ int main(int argc, char **argv) MPI_T_pvar_get_num(&num); for(i = 0; i < num; i++) { name_len = desc_len = 256; - PMPI_T_pvar_get_info(i, name, &name_len, &verbosity, - &var_class, &datatype, &enumtype, description, &desc_len, &bind, - &readonly, &continuous, &atomic); + rc = PMPI_T_pvar_get_info(i, name, &name_len, &verbosity, + &var_class, &datatype, &enumtype, description, &desc_len, &bind, + &readonly, &continuous, &atomic); + if( MPI_SUCCESS != rc ) + continue; if(strcmp(name, xml_counter) == 0) { xml_index = i; @@ -484,7 +485,8 @@ int main(int argc, char **argv) ``` #### Snapshot Feature in the mmap Interface -The mmap interface also allows for collecting snapshots of the SPC counter values periodically throughout an execution through a built-in snapshot feature. These snapshots use the 'orte_spc_snapshot_period' MCA parameter to determine the length of time after which to create a copy of the SPC data file from the mmap interface. The snapshot data file copies simply append a timestamp to the end of the mmap data file to keep track of when that snapshot was taken. These snapshot files can be used to show how counter values change over time. +NOTE: This feature is currently in the process of being moved to an external tool, and will be unavailable until then. +The mmap interface also allows for collecting snapshots of the SPC counter values periodically throughout an execution through a snapshot feature. These snapshots use an MCA parameter to determine the length of time after which to create a copy of the SPC data file from the mmap interface. 
The snapshot data file copies simply append a timestamp to the end of the mmap data file to keep track of when that snapshot was taken. These snapshot files can be used to show how counter values change over time. The following is an example python script that takes the values from these snapshot files and creates heatmaps of the change in the counter values over time. This example script takes three command line arguments: a directory where all of the snapshot, XML, and original data files are stored; the XML string or Open MPI jobid to identify these data and XML files; a comma-separated list of SPCs to be used in creating the heatmaps. diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 9c06074fff4..4700813ba49 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,