193 commits
d909529
Add GPU packing and unpacking
Nov 7, 2014
e3463fa
indexed datatype new, bonus stask support.
eddy16112 Nov 14, 2014
cf44223
RDMA send is now working.
eddy16112 Apr 9, 2015
c6a00d7
Add support for vector datatype. Add pipeline.
eddy16112 Apr 22, 2015
c10d3f4
fix gpu memory and vector datatype
eddy16112 May 1, 2015
0fda4df
unrestricted GPU. Instead of forcing everything to go on
bosilca May 7, 2015
bc0e104
Using globally defined indexes lead to several synchronization
bosilca Jun 18, 2015
6fda036
Generate the Makefile. It will now be placed in the bindir
bosilca Jun 18, 2015
742992a
This file was certainly not supposed to be here. There is NO valid
bosilca Jun 18, 2015
9c63b09
Add the capability to install the generated library and other
bosilca Jun 18, 2015
a681551
Open the datatype CUDA library from a default install location.
bosilca Jun 18, 2015
bcd77f6
Add a patch from Rolf fixing 2 issues:
bosilca Jun 30, 2015
bdfe31b
clean up code in pack and unpack
eddy16112 Aug 19, 2015
a670db4
big changes, now pack is driven by receiver by active message
eddy16112 Aug 22, 2015
42ad920
intel test working
eddy16112 Aug 31, 2015
bab3559
fix a bug when buffer is not big enough for whole ddt
eddy16112 Aug 31, 2015
29c90a0
if data in different gpu, instead of copy direct from one to the other,
eddy16112 Sep 2, 2015
44a1550
now we can use cudamemcpy2d
eddy16112 Sep 8, 2015
a67c842
enable zero copy + fix GPU buffer bug
eddy16112 Sep 8, 2015
7bd8151
put pipeline size into mca
eddy16112 Sep 14, 2015
9d10357
Upon datatype commit create a list of iovec representing a single
bosilca Sep 15, 2015
756b2af
contiguous vs non-contiguous is working
eddy16112 Sep 17, 2015
3a6bdd9
Fix pipeline bug
eddy16112 Sep 17, 2015
f86c81e
now we are able to pack directly to remote buffer if receiver is
eddy16112 Sep 18, 2015
6ae39b2
add ddt_benchmark
eddy16112 Sep 29, 2015
25ead9b
modify for matrix transpose
eddy16112 Oct 2, 2015
5e14fdd
enable vector
eddy16112 Oct 2, 2015
d03c601
receiver now will send msg back to sender for buffer reuse
eddy16112 Oct 6, 2015
c377c36
fix zerocopy
eddy16112 Oct 9, 2015
c4b5fcf
offset instead of actual addess, and lots of clean up for unused
eddy16112 Oct 22, 2015
c451a4a
rewrite pipeline
eddy16112 Oct 25, 2015
fa331f8
s up and running. PUT size in an MCA parameters.
eddy16112 Oct 26, 2015
a3f79aa
less bugs
eddy16112 Oct 27, 2015
8acf254
fix pipelining for non-contiguous to contiguous
eddy16112 Oct 27, 2015
fe88901
opal_datatype is chnaged, so we need more space
Oct 27, 2015
688a423
reorder datatypes to cache boundaries
Oct 27, 2015
04a9785
slience warnings
eddy16112 Oct 28, 2015
e05edf8
remove smcuda btl calls from pml ob1
eddy16112 Oct 28, 2015
c13df8c
this file is not used anymore
eddy16112 Oct 28, 2015
fa02341
cuda ddt support is able to turn itself off. Make it support
eddy16112 Oct 29, 2015
7258acd
fix a cuda stream bug for iov, remove some stream syncs
eddy16112 Oct 30, 2015
4271a0d
in openib, disable rdma for non-contiguous gpu data
eddy16112 Nov 4, 2015
85f6428
move ddt kernel support function pointer into opal_datatype_cuda.c
eddy16112 Nov 4, 2015
44361c0
rename some functions
eddy16112 Nov 5, 2015
6b95a38
check point
eddy16112 Nov 6, 2015
101387c
Add support for caching the unpacked datatype description
bosilca Nov 7, 2015
0379c0b
check point use raw_cached, but cuda iov caching is not enabled
eddy16112 Nov 7, 2015
4a39f49
check point, split iov into two version, non-cached and cached
eddy16112 Nov 8, 2015
d9927f4
check point iov cache
eddy16112 Nov 8, 2015
4c192e9
another checkpoint
eddy16112 Nov 9, 2015
0f24449
check point, cuda iov is cached, but not used for pack/unpack
eddy16112 Nov 9, 2015
f1f4a7d
check point, ready to use cached cuda iov
eddy16112 Nov 10, 2015
043fa9c
checkpoint, cached cuda iov is working with multiple send, but not for
eddy16112 Nov 10, 2015
5e8c77a
checkpoint, fix a bug for partial unpack
eddy16112 Nov 11, 2015
deb67ec
checkpoint, fix unpack size
eddy16112 Nov 11, 2015
56c9fa4
checkpoint, during unpack, cache the entire iov before unpack
eddy16112 Nov 11, 2015
f3e03bd
another checkpoint
eddy16112 Nov 12, 2015
f524be6
checkpoint , remove unnecessary cuda stream sync
eddy16112 Nov 12, 2015
4b76d89
use bit to replace %
eddy16112 Nov 13, 2015
ff3a896
rollback to use %, not bit, since it is faster, not sure why
eddy16112 Nov 13, 2015
141cbbf
now cuda iov is {nc_disp, c_disp}
eddy16112 Nov 13, 2015
b2f6611
clean up kernel, put variables uses multiple times into register
eddy16112 Nov 13, 2015
a59950f
another checkpoint
eddy16112 Nov 14, 2015
e3cb0ee
now convertor->count > 1 is woring
eddy16112 Nov 14, 2015
30d493b
move the cuda iov caching into a seperate function
eddy16112 Nov 16, 2015
8c830a6
these two variables are useless now
eddy16112 Nov 16, 2015
8d1db8a
fix a bug for ib, current count of convertor should be set in
eddy16112 Nov 16, 2015
754b0d0
cleanup, move cudamalloc into cache cuda iov
eddy16112 Nov 17, 2015
27e44f5
rearrange varibles
eddy16112 Nov 17, 2015
7a663f4
if cuda_iov is not big enough, use realloc. However, cudaMallocHost does
eddy16112 Nov 17, 2015
150ba7a
make sure check pointer is not NULL before free it
eddy16112 Nov 18, 2015
38ca646
checkpoint, rewrite non-cached version
eddy16112 Nov 25, 2015
ade51ba
fix for non cached iov
eddy16112 Nov 25, 2015
897ea1e
fix the non cached iov, set position should be put at first
eddy16112 Nov 25, 2015
e5d3441
move ddt iov to cuda iov into a function
eddy16112 Nov 25, 2015
d61f424
merge iov cached and non-cached
eddy16112 Nov 30, 2015
41edca1
for non cached iov, if there is no enough cuda iov space, break
eddy16112 Dec 1, 2015
5cf6dba
cached iov is working for count = 1
eddy16112 Nov 7, 2015
9386ffb
cache the entire cuda iov
eddy16112 Nov 11, 2015
540e448
now cuda iov is {nc_disp, c_disp}
eddy16112 Nov 13, 2015
180382b
clean up kernel, put variables uses multiple times into register
eddy16112 Nov 13, 2015
9ba68ca
cached cuda iov is working for count > 1
eddy16112 Nov 14, 2015
12a3ade
move the cuda iov caching into a seperate function
eddy16112 Nov 16, 2015
a39bc35
these two variables are useless now
eddy16112 Nov 16, 2015
1c3fb45
fix a bug for ib, current count of convertor should be set in
eddy16112 Nov 16, 2015
02d6560
cleanup, move cudamalloc into cache cuda iov
eddy16112 Nov 17, 2015
47cd909
rearrange varibles
eddy16112 Nov 17, 2015
c953c5b
if cuda_iov is not big enough, use realloc. However, cudaMallocHost does
eddy16112 Nov 17, 2015
5d3cca0
make sure check pointer is not NULL before free it
eddy16112 Nov 18, 2015
9517b4d
rewrite non cached iov, make it unified with cached iov
eddy16112 Nov 25, 2015
dfcab4a
Merge branch 'cuda' of https://github.com/eddy16112/ompi into cuda
eddy16112 Dec 2, 2015
d242b0c
apply loop unroll on packing kernels
eddy16112 Feb 5, 2016
2e8b414
apply unroll to unpack
eddy16112 Feb 23, 2016
0c680c2
fix a cuda event bug. cudaStreamWaitEvent is not blocking call.
eddy16112 Feb 23, 2016
ad0d5f1
Merge pull request #6 from ICLDisco/master
eddy16112 Feb 26, 2016
b6d56eb
new vector kernel
eddy16112 Feb 26, 2016
48b2d06
Add GPU packing and unpacking
Nov 7, 2014
3f3ee94
indexed datatype new, bonus stask support.
eddy16112 Nov 14, 2014
34f4a3b
RDMA send is now working.
eddy16112 Apr 9, 2015
fb10144
Add support for vector datatype. Add pipeline.
eddy16112 Apr 22, 2015
ae49135
fix gpu memory and vector datatype
eddy16112 May 1, 2015
ef54b4d
unrestricted GPU. Instead of forcing everything to go on
bosilca May 7, 2015
653f54d
Using globally defined indexes lead to several synchronization
bosilca Jun 18, 2015
10f5543
Generate the Makefile. It will now be placed in the bindir
bosilca Jun 18, 2015
96ec3c5
This file was certainly not supposed to be here. There is NO valid
bosilca Jun 18, 2015
d9ca4ae
Add the capability to install the generated library and other
bosilca Jun 18, 2015
805938d
Open the datatype CUDA library from a default install location.
bosilca Jun 18, 2015
0b4c5df
Add a patch from Rolf fixing 2 issues:
bosilca Jun 30, 2015
b74997e
clean up code in pack and unpack
eddy16112 Aug 19, 2015
c182b30
big changes, now pack is driven by receiver by active message
eddy16112 Aug 22, 2015
d131f81
intel test working
eddy16112 Aug 31, 2015
c5fb939
fix a bug when buffer is not big enough for whole ddt
eddy16112 Aug 31, 2015
bcb1e05
if data in different gpu, instead of copy direct from one to the other,
eddy16112 Sep 2, 2015
7a86b4b
now we can use cudamemcpy2d
eddy16112 Sep 8, 2015
5f2aac5
enable zero copy + fix GPU buffer bug
eddy16112 Sep 8, 2015
eee322e
put pipeline size into mca
eddy16112 Sep 14, 2015
630e831
Upon datatype commit create a list of iovec representing a single
bosilca Sep 15, 2015
c3016bc
contiguous vs non-contiguous is working
eddy16112 Sep 17, 2015
39d548a
Fix pipeline bug
eddy16112 Sep 17, 2015
817a4fc
now we are able to pack directly to remote buffer if receiver is
eddy16112 Sep 18, 2015
ea582c5
add ddt_benchmark
eddy16112 Sep 29, 2015
0a0df96
modify for matrix transpose
eddy16112 Oct 2, 2015
58371c8
enable vector
eddy16112 Oct 2, 2015
277c8bd
receiver now will send msg back to sender for buffer reuse
eddy16112 Oct 6, 2015
4591656
fix zerocopy
eddy16112 Oct 9, 2015
c5add7e
offset instead of actual addess, and lots of clean up for unused
eddy16112 Oct 22, 2015
38db0e6
rewrite pipeline
eddy16112 Oct 25, 2015
0ab564b
s up and running. PUT size in an MCA parameters.
eddy16112 Oct 26, 2015
50abfc8
less bugs
eddy16112 Oct 27, 2015
7c86f4c
fix pipelining for non-contiguous to contiguous
eddy16112 Oct 27, 2015
06a07a5
opal_datatype is chnaged, so we need more space
Oct 27, 2015
986e5c9
reorder datatypes to cache boundaries
Oct 27, 2015
08f69f6
slience warnings
eddy16112 Oct 28, 2015
de1ef4e
remove smcuda btl calls from pml ob1
eddy16112 Oct 28, 2015
b60bae5
this file is not used anymore
eddy16112 Oct 28, 2015
351bce9
cuda ddt support is able to turn itself off. Make it support
eddy16112 Oct 29, 2015
aa57116
fix a cuda stream bug for iov, remove some stream syncs
eddy16112 Oct 30, 2015
5eb7bf1
in openib, disable rdma for non-contiguous gpu data
eddy16112 Nov 4, 2015
aa24f4c
move ddt kernel support function pointer into opal_datatype_cuda.c
eddy16112 Nov 4, 2015
1b1d827
rename some functions
eddy16112 Nov 5, 2015
3235663
check point
eddy16112 Nov 6, 2015
f3d37d8
Add support for caching the unpacked datatype description
bosilca Nov 7, 2015
e4e11bc
check point use raw_cached, but cuda iov caching is not enabled
eddy16112 Nov 7, 2015
bbd221f
check point, split iov into two version, non-cached and cached
eddy16112 Nov 8, 2015
a741011
check point iov cache
eddy16112 Nov 8, 2015
7b87adb
another checkpoint
eddy16112 Nov 9, 2015
270898b
check point, cuda iov is cached, but not used for pack/unpack
eddy16112 Nov 9, 2015
ee0408f
check point, ready to use cached cuda iov
eddy16112 Nov 10, 2015
b76bf60
checkpoint, cached cuda iov is working with multiple send, but not for
eddy16112 Nov 10, 2015
a0e9493
checkpoint, fix a bug for partial unpack
eddy16112 Nov 11, 2015
c1f5959
checkpoint, fix unpack size
eddy16112 Nov 11, 2015
5b63994
checkpoint, during unpack, cache the entire iov before unpack
eddy16112 Nov 11, 2015
fb68b99
another checkpoint
eddy16112 Nov 12, 2015
64e2a62
checkpoint , remove unnecessary cuda stream sync
eddy16112 Nov 12, 2015
39de9e0
use bit to replace %
eddy16112 Nov 13, 2015
f17c5f8
rollback to use %, not bit, since it is faster, not sure why
eddy16112 Nov 13, 2015
4ea326e
now cuda iov is {nc_disp, c_disp}
eddy16112 Nov 13, 2015
491dd73
clean up kernel, put variables uses multiple times into register
eddy16112 Nov 13, 2015
6cc7ada
another checkpoint
eddy16112 Nov 14, 2015
998a072
now convertor->count > 1 is woring
eddy16112 Nov 14, 2015
83d4858
move the cuda iov caching into a seperate function
eddy16112 Nov 16, 2015
f49dae4
these two variables are useless now
eddy16112 Nov 16, 2015
ef04c97
fix a bug for ib, current count of convertor should be set in
eddy16112 Nov 16, 2015
189fa15
cleanup, move cudamalloc into cache cuda iov
eddy16112 Nov 17, 2015
56eeffb
rearrange varibles
eddy16112 Nov 17, 2015
84f7abb
if cuda_iov is not big enough, use realloc. However, cudaMallocHost does
eddy16112 Nov 17, 2015
65424d0
make sure check pointer is not NULL before free it
eddy16112 Nov 18, 2015
5d316d9
checkpoint, rewrite non-cached version
eddy16112 Nov 25, 2015
02c8b7f
fix for non cached iov
eddy16112 Nov 25, 2015
bb807fc
fix the non cached iov, set position should be put at first
eddy16112 Nov 25, 2015
842cc3f
move ddt iov to cuda iov into a function
eddy16112 Nov 25, 2015
6df01a5
merge iov cached and non-cached
eddy16112 Nov 30, 2015
da23f82
for non cached iov, if there is no enough cuda iov space, break
eddy16112 Dec 1, 2015
880a233
cached iov is working for count = 1
eddy16112 Nov 7, 2015
7b26aaa
cache the entire cuda iov
eddy16112 Nov 11, 2015
6af6658
now cuda iov is {nc_disp, c_disp}
eddy16112 Nov 13, 2015
63e148e
clean up kernel, put variables uses multiple times into register
eddy16112 Nov 13, 2015
c75393f
cached cuda iov is working for count > 1
eddy16112 Nov 14, 2015
11d4a5b
move the cuda iov caching into a seperate function
eddy16112 Nov 16, 2015
1e29fc0
these two variables are useless now
eddy16112 Nov 16, 2015
1bac78c
fix a bug for ib, current count of convertor should be set in
eddy16112 Nov 16, 2015
686c90e
cleanup, move cudamalloc into cache cuda iov
eddy16112 Nov 17, 2015
85dad6c
rearrange varibles
eddy16112 Nov 17, 2015
4c6c0e4
if cuda_iov is not big enough, use realloc. However, cudaMallocHost does
eddy16112 Nov 17, 2015
2120edd
make sure check pointer is not NULL before free it
eddy16112 Nov 18, 2015
98fc62c
rewrite non cached iov, make it unified with cached iov
eddy16112 Nov 25, 2015
eb143dc
apply loop unroll on packing kernels
eddy16112 Feb 5, 2016
b45b646
apply unroll to unpack
eddy16112 Feb 23, 2016
4037554
fix a cuda event bug. cudaStreamWaitEvent is not blocking call.
eddy16112 Feb 23, 2016
e6c765e
new vector kernel
eddy16112 Feb 26, 2016
e981580
Merge branch 'cuda' of https://github.com/eddy16112/ompi into cuda
eddy16112 Feb 26, 2016
2b0048f
fix a if CUDA_41 error
eddy16112 Feb 26, 2016
d22e54a
clean up a if
eddy16112 Mar 1, 2016
4 changes: 4 additions & 0 deletions configure.ac
@@ -1357,6 +1357,10 @@ m4_ifdef([project_oshmem],

opal_show_subtitle "Final output"

if test "$OPAL_cuda_support" != "0"; then
AC_CONFIG_FILES([opal/datatype/cuda/Makefile])
fi

AC_CONFIG_FILES([
Makefile

2 changes: 1 addition & 1 deletion ompi/datatype/ompi_datatype.h
@@ -94,7 +94,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_datatype_t);
/* Using set constant for padding of the DATATYPE handles because the size of
* base structure is very close to being the same no matter the bitness.
*/
#define PREDEFINED_DATATYPE_PAD (512)
#define PREDEFINED_DATATYPE_PAD (1024)

struct ompi_predefined_datatype_t {
struct ompi_datatype_t dt;
134 changes: 126 additions & 8 deletions ompi/mca/pml/ob1/pml_ob1_cuda.c
@@ -37,11 +37,24 @@
#include "ompi/mca/bml/base/base.h"
#include "ompi/memchecker.h"

#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"
#include "opal/mca/btl/smcuda/btl_smcuda.h"

#define CUDA_DDT_WITH_RDMA 1

size_t mca_pml_ob1_rdma_cuda_btls(
mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base,
size_t size,
mca_pml_ob1_com_btl_t* rdma_btls);

int mca_pml_ob1_rdma_cuda_btl_register_data(
mca_pml_ob1_com_btl_t* rdma_btls,
uint32_t num_btls_used,
struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device);

size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint);

int mca_pml_ob1_cuda_need_buffers(void * rreq,
mca_btl_base_module_t* btl);
@@ -56,16 +69,18 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size) {
int rc;
#if OPAL_CUDA_GDR_SUPPORT
/* With some BTLs, switch to RNDV from RGET at large messages */
if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
(sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
int32_t local_device = 0;

sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor);
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
#if OPAL_CUDA_GDR_SUPPORT
/* With some BTLs, switch to RNDV from RGET at large messages */
if ((sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
unsigned char *base;
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
/* Set flag back */
@@ -75,6 +90,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {

rc = mca_common_cuda_get_device(&local_device);
if (rc != 0) {
opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc);
return rc;
}
mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 0, local_device);
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
@@ -92,7 +114,48 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
/* Do not send anything with first rendezvous message as copying GPU
* memory into RNDV message is expensive. */
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
if ((mca_pml_ob1_rdma_cuda_avail(sendreq->req_endpoint) != 0) &&
(opal_datatype_cuda_kernel_support == 1) &&
(bml_btl->btl->btl_cuda_ddt_allow_rdma == 1)) {
unsigned char *base;
size_t buffer_size = 0;
if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) {
buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth;
} else {
buffer_size = convertor->local_size;
}
base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
convertor->gpu_buffer_ptr = base;
convertor->gpu_buffer_size = buffer_size;
sendreq->req_send.req_bytes_packed = convertor->local_size;
opal_output(0, "malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth);
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
sendreq->req_endpoint,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {

rc = mca_common_cuda_get_device(&local_device);
if (rc != 0) {
opal_output(0, "Failed to get the GPU device ID, rc=%d\n", rc);
return rc;
}
mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 1, local_device);

rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);

if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_pml_ob1_free_rdma_resources(sendreq);
}
} else {
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}


} else {
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
}
return rc;
}
@@ -152,6 +215,61 @@ size_t mca_pml_ob1_rdma_cuda_btls(
return num_btls_used;
}

int mca_pml_ob1_rdma_cuda_btl_register_data(
mca_pml_ob1_com_btl_t* rdma_btls,
uint32_t num_btls_used,
struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device)
{
uint32_t i;
for (i = 0; i < num_btls_used; i++) {
mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg;
mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *)
((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data));
// printf("base %p\n", cuda_reg->base.base);
// for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) {
// mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg);
// // printf("event %lu, j %d\n", convertor->pipeline_event[j], j);
// }
cuda_reg->data.pack_required = pack_required;
cuda_reg->data.gpu_device = gpu_device;
cuda_reg->data.pack_convertor = pack_convertor;

}
return 0;
}

size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint)
{
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
double weight_total = 0;
int num_btls_used = 0, n;

/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
return 0;
}

/* check to see if memory is registered */
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);

if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
}

/* if we don't use leave_pinned and all BTLs that already have this memory
 * registered amount to less than half of available bandwidth - fall back to
 * pipeline protocol */
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0;

return num_btls_used;
}

int mca_pml_ob1_cuda_need_buffers(void * rreq,
mca_btl_base_module_t* btl)
{
7 changes: 5 additions & 2 deletions ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -649,8 +649,11 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
if (mca_pml_ob1_cuda_need_buffers(recvreq, btl))
#endif /* OPAL_CUDA_SUPPORT */
{
mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
return;
/* need more careful check here */
if (! (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
return;
}
}
}

18 changes: 17 additions & 1 deletion ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -675,10 +675,26 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
MCA_PML_OB1_HDR_FLAGS_PIN);
}

#if OPAL_CUDA_SUPPORT
if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) {
data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr;
printf("START RMDA data_ptr %p\n", data_ptr);
} else {
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
}
/* Set flag back */
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
} else {
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
}
#else
/* at this time ob1 does not support non-contiguous gets. the convertor represents a
* contiguous block of memory */
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);

#endif

local_handle = sendreq->req_rdma[0].btl_reg;

/* allocate an rdma fragment to keep track of the request size for use in the fin message */
2 changes: 1 addition & 1 deletion opal/datatype/Makefile.am
@@ -63,7 +63,7 @@ libdatatype_la_SOURCES = \
opal_datatype_pack.c \
opal_datatype_position.c \
opal_datatype_resize.c \
opal_datatype_unpack.c
opal_datatype_unpack.c

libdatatype_la_LIBADD = libdatatype_reliable.la

60 changes: 60 additions & 0 deletions opal/datatype/cuda/Makefile.in
@@ -0,0 +1,60 @@
@SET_MAKE@

AM_CPPFLAGS = @common_cuda_CPPFLAGS@
srcdir = @srcdir@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
VPATH = @srcdir@

NVCC = nvcc
ARCH = @AR@
ARCHFLAGS = cr
STLIB ?= opal_datatype_cuda_kernel.a
DYLIB ?= opal_datatype_cuda_kernel.so
EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda
subdir = opal/datatype/cuda

CC = nvcc
CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@'
LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@'

SRC := \
opal_datatype_cuda.cu \
opal_datatype_pack_cuda_kernel.cu \
opal_datatype_pack_cuda_wrapper.cu \
opal_datatype_unpack_cuda_kernel.cu \
opal_datatype_unpack_cuda_wrapper.cu

OBJ := $(SRC:.cu=.o)

.PHONY: all clean cleanall

all: Makefile $(STLIB) $(DYLIB)

Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
esac;

$(STLIB): $(OBJ)
$(ARCH) $(ARCHFLAGS) $@ $(OBJ)
@RANLIB@ $@

$(DYLIB): $(OBJ)
$(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ)

%.o: %.cu
$(NVCC) $(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@

install: $(DYLIB)
cp -f $(DYLIB) @OMPI_WRAPPER_LIBDIR@/

clean:
rm -f $(OBJ)

cleanall: clean
rm -f $(STLIB) $(DYLIB)